From 993b7db88e5d9c7549edcdfd34723f38153b18c8 Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Mon, 28 Jul 2025 03:45:55 +0000 Subject: [PATCH 1/7] add healthcheck orchestration logic Signed-off-by: Arjun Raja Yogidas --- cmd/nerdctl/container/container_create.go | 4 - ...o => container_health_check_linux_test.go} | 253 +++++++++++++++--- cmd/nerdctl/container/container_run.go | 10 +- cmd/nerdctl/container/container_run_test.go | 6 + cmd/nerdctl/helpers/flagutil.go | 6 +- go.mod | 2 +- pkg/api/types/container_types.go | 13 +- pkg/cmd/container/create.go | 3 - pkg/cmd/container/health_check.go | 1 - pkg/cmd/container/kill.go | 6 + pkg/cmd/container/remove.go | 6 + pkg/cmd/container/stop.go | 4 + pkg/containerutil/containerutil.go | 28 ++ pkg/healthcheck/executor.go | 40 ++- pkg/healthcheck/health.go | 13 +- pkg/healthcheck/healthcheck_manager_darwin.go | 43 +++ .../healthcheck_manager_freebsd.go | 43 +++ pkg/healthcheck/healthcheck_manager_linux.go | 214 +++++++++++++++ .../healthcheck_manager_windows.go | 43 +++ pkg/ocihook/ocihook.go | 4 + 20 files changed, 663 insertions(+), 79 deletions(-) rename cmd/nerdctl/container/{container_health_check_test.go => container_health_check_linux_test.go} (74%) create mode 100644 pkg/healthcheck/healthcheck_manager_darwin.go create mode 100644 pkg/healthcheck/healthcheck_manager_freebsd.go create mode 100644 pkg/healthcheck/healthcheck_manager_linux.go create mode 100644 pkg/healthcheck/healthcheck_manager_windows.go diff --git a/cmd/nerdctl/container/container_create.go b/cmd/nerdctl/container/container_create.go index fc5cbdd97c8..e30d529481a 100644 --- a/cmd/nerdctl/container/container_create.go +++ b/cmd/nerdctl/container/container_create.go @@ -279,10 +279,6 @@ func createOptions(cmd *cobra.Command) (types.ContainerCreateOptions, error) { if err != nil { return opt, err } - opt.HealthStartInterval, err = cmd.Flags().GetDuration("health-start-interval") - if err != nil { - return opt, err - } opt.NoHealthcheck, err = cmd.Flags().GetBool("no-healthcheck") if err != nil { return opt, err diff --git a/cmd/nerdctl/container/container_health_check_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go similarity index 74% rename from cmd/nerdctl/container/container_health_check_test.go rename to cmd/nerdctl/container/container_health_check_linux_test.go index aa2d1603313..48c84618fcb 100644 --- a/cmd/nerdctl/container/container_health_check_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -19,7 +19,6 @@ package container import ( "encoding/json" "errors" - "fmt" "strings" "testing" "time" @@ -32,11 +31,16 @@ import ( "github.com/containerd/nerdctl/mod/tigron/tig" "github.com/containerd/nerdctl/v2/pkg/healthcheck" + "github.com/containerd/nerdctl/v2/pkg/rootlessutil" "github.com/containerd/nerdctl/v2/pkg/testutil" "github.com/containerd/nerdctl/v2/pkg/testutil/nerdtest" ) func TestContainerHealthCheckBasic(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } + testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. @@ -134,6 +138,10 @@ func TestContainerHealthCheckBasic(t *testing.T) { } func TestContainerHealthCheckAdvance(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } + testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. @@ -391,43 +399,6 @@ func TestContainerHealthCheckAdvance(t *testing.T) { } }, }, - { - Description: "Healthcheck emits large output repeatedly", - Setup: func(data test.Data, helpers test.Helpers) { - helpers.Ensure("run", "-d", "--name", data.Identifier(), - "--health-cmd", "yes X | head -c 60000", - "--health-interval", "1s", "--health-timeout", "2s", - testutil.CommonImage, "sleep", nerdtest.Infinity) - nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - }, - Cleanup: func(data test.Data, helpers test.Helpers) { - helpers.Anyhow("rm", "-f", data.Identifier()) - }, - Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { - for i := 0; i < 3; i++ { - helpers.Ensure("container", "healthcheck", data.Identifier()) - time.Sleep(2 * time.Second) - } - return helpers.Command("inspect", data.Identifier()) - }, - Expected: func(data test.Data, helpers test.Helpers) *test.Expected { - return &test.Expected{ - ExitCode: 0, - Output: expect.All(func(_ string, t tig.T) { - inspect := nerdtest.InspectContainer(helpers, data.Identifier()) - h := inspect.State.Health - debug, _ := json.MarshalIndent(h, "", " ") - t.Log(string(debug)) - assert.Assert(t, h != nil, "expected health state") - assert.Equal(t, h.Status, healthcheck.Healthy) - assert.Assert(t, len(h.Log) >= 3, "expected at least 3 health log entries") - for _, log := range h.Log { - assert.Assert(t, len(log.Output) >= 1024, fmt.Sprintf("each output should be >= 1024 bytes, was: %s", log.Output)) - } - }), - } - }, - }, { Description: "Health log in inspect keeps only the latest 5 entries", Setup: func(data test.Data, helpers test.Helpers) { @@ -602,3 +573,209 @@ func TestContainerHealthCheckAdvance(t *testing.T) { testCase.Run(t) } + +func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) { + testCase := nerdtest.Setup() + testCase.Require = require.Not(nerdtest.Docker) + + testCase.SubTests = []*test.Case{ + { + Description: "Basic healthy container with systemd-triggered healthcheck", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo healthy", + "--health-interval", "2s", + testutil.CommonImage, "sleep", "30") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + // Wait for a healthcheck to execute + time.Sleep(2 * time.Second) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + // Ensure proper cleanup of systemd units + helpers.Anyhow("stop", data.Identifier()) + time.Sleep(500 * time.Millisecond) // Allow systemd cleanup + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(stdout string, t tig.T) { + inspect := nerdtest.InspectContainer(helpers, data.Identifier()) + h := inspect.State.Health + assert.Assert(t, h != nil, "expected health state to be present") + assert.Equal(t, h.Status, "healthy") + assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry") + }), + } + }, + }, + { + Description: "Kill stops healthcheck execution", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo healthy", + "--health-interval", "1s", + testutil.CommonImage, "sleep", "30") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + time.Sleep(2 * time.Second) // Wait for at least one health check to execute + helpers.Ensure("kill", data.Identifier()) // Kill the container + time.Sleep(3 * time.Second) // Wait to allow any potential extra healthchecks (shouldn't happen) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + // Container is already killed, just remove it + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(stdout string, t tig.T) { + inspect := nerdtest.InspectContainer(helpers, data.Identifier()) + h := inspect.State.Health + assert.Assert(t, h != nil, "expected health state to be present") + assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry") + + // Get container FinishedAt timestamp + containerEnd, err := time.Parse(time.RFC3339Nano, inspect.State.FinishedAt) + assert.NilError(t, err, "parsing container FinishedAt") + + // Assert all healthcheck log start times are before container finished + for _, entry := range h.Log { + assert.NilError(t, err, "parsing healthcheck Start time") + assert.Assert(t, entry.Start.Before(containerEnd), "healthcheck ran after container was killed") + } + }), + } + }, + }, + } + testCase.Run(t) +} + +func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("systemd healthcheck tests are skipped in rootless environment") + } + testCase := nerdtest.Setup() + testCase.Require = require.Not(nerdtest.Docker) + + testCase.SubTests = []*test.Case{ + { + // Tests that CreateTimer() successfully creates systemd timer units and + // RemoveTransientHealthCheckFiles() properly cleans up units when container stops. + Description: "Systemd timer unit creation and cleanup", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo healthy", + "--health-interval", "1s", + testutil.CommonImage, "sleep", "30") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + // Wait longer for systemd timer creation and first healthcheck execution + time.Sleep(3 * time.Second) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + return helpers.Command("inspect", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(stdout string, t tig.T) { + // Get container ID and check systemd timer + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + + // Check systemd timer + result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, _ tig.T) { + // Verify that a timer exists for this specific container + assert.Assert(t, strings.Contains(stdout, containerID), + "expected to find nerdctl healthcheck timer containing container ID: %s", containerID) + }, + }) + // Stop container and verify cleanup + helpers.Ensure("stop", data.Identifier()) + time.Sleep(500 * time.Millisecond) // Allow cleanup to complete + + // Check that timer is gone + result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, _ tig.T) { + assert.Assert(t, !strings.Contains(stdout, containerID), + "expected nerdctl healthcheck timer for container ID %s to be removed after container stop", containerID) + + }, + }) + }), + } + }, + }, + { + Description: "Container restart recreates systemd timer", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "echo restart-test", + "--health-interval", "2s", + testutil.CommonImage, "sleep", "60") + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + time.Sleep(3 * time.Second) // Wait for initial timer creation + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + // Get container ID for verification + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + + // Step 1: Verify timer exists initially + result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + assert.Assert(t, strings.Contains(stdout, containerID), + "expected timer for container %s to exist initially", containerID) + }, + }) + + // Step 2: Stop container + helpers.Ensure("stop", data.Identifier()) + time.Sleep(1 * time.Second) // Allow cleanup + + // Step 3: Verify timer is removed after stop + result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + result.Run(&test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + assert.Assert(t, !strings.Contains(stdout, containerID), + "expected timer for container %s to be removed after stop", containerID) + }, + }) + + // Step 4: Restart container + helpers.Ensure("start", data.Identifier()) + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + time.Sleep(3 * time.Second) // Wait for timer recreation + + // Step 5: Verify timer is recreated after restart - this is our final verification + return helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: expect.ExitCodeNoCheck, + Output: func(stdout string, t tig.T) { + containerInspect := nerdtest.InspectContainer(helpers, data.Identifier()) + containerID := containerInspect.ID + assert.Assert(t, strings.Contains(stdout, containerID), + "expected timer for container %s to be recreated after restart", containerID) + }, + } + }, + }, + } + testCase.Run(t) +} diff --git a/cmd/nerdctl/container/container_run.go b/cmd/nerdctl/container/container_run.go index 67b797bdce9..70222cff71a 100644 --- a/cmd/nerdctl/container/container_run.go +++ b/cmd/nerdctl/container/container_run.go @@ -37,6 +37,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/containerutil" "github.com/containerd/nerdctl/v2/pkg/defaults" "github.com/containerd/nerdctl/v2/pkg/errutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/logging" "github.com/containerd/nerdctl/v2/pkg/netutil" @@ -240,7 +241,6 @@ func setCreateFlags(cmd *cobra.Command) { cmd.Flags().Duration("health-timeout", 0, "Maximum time to allow one check to run (default: 30s)") cmd.Flags().Int("health-retries", 0, "Consecutive failures needed to report unhealthy (default: 3)") cmd.Flags().Duration("health-start-period", 0, "Start period for the container to initialize before starting health-retries countdown") - cmd.Flags().Duration("health-start-interval", 0, "Time between running the checks during the start period") cmd.Flags().Bool("no-healthcheck", false, "Disable any container-specified HEALTHCHECK") // #region env flags @@ -445,6 +445,14 @@ func runAction(cmd *cobra.Command, args []string) error { return err } + // Setup container healthchecks. + if err := healthcheck.CreateTimer(ctx, c); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, c); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + if createOpt.Detach { fmt.Fprintln(createOpt.Stdout, id) return nil diff --git a/cmd/nerdctl/container/container_run_test.go b/cmd/nerdctl/container/container_run_test.go index 64692a3ead7..e3d50940f8b 100644 --- a/cmd/nerdctl/container/container_run_test.go +++ b/cmd/nerdctl/container/container_run_test.go @@ -841,6 +841,9 @@ func TestRunDomainname(t *testing.T) { } func TestRunHealthcheckFlags(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } testCase := nerdtest.Setup() testCases := []struct { @@ -990,6 +993,9 @@ func TestRunHealthcheckFlags(t *testing.T) { } func TestRunHealthcheckFromImage(t *testing.T) { + if rootlessutil.IsRootless() { + t.Skip("healthcheck tests are skipped in rootless environment") + } nerdtest.Setup() dockerfile := fmt.Sprintf(`FROM %s diff --git a/cmd/nerdctl/helpers/flagutil.go b/cmd/nerdctl/helpers/flagutil.go index 32217ae95c6..22fc1acb1bf 100644 --- a/cmd/nerdctl/helpers/flagutil.go +++ b/cmd/nerdctl/helpers/flagutil.go @@ -52,8 +52,7 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error { options.HealthInterval != 0 || options.HealthTimeout != 0 || options.HealthRetries != 0 || - options.HealthStartPeriod != 0 || - options.HealthStartInterval != 0 + options.HealthStartPeriod != 0 if options.NoHealthcheck { if options.HealthCmd != "" || healthFlagsSet { @@ -74,9 +73,6 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error { if options.HealthStartPeriod < 0 { return fmt.Errorf("--health-start-period cannot be negative") } - if options.HealthStartInterval < 0 { - return fmt.Errorf("--health-start-interval cannot be negative") - } return nil } diff --git a/go.mod b/go.mod index 6d554b021f3..23c621b97d1 100644 --- a/go.mod +++ b/go.mod @@ -120,7 +120,7 @@ require ( github.com/santhosh-tekuri/jsonschema/v6 v6.0.1 // indirect github.com/sasha-s/go-deadlock v0.3.5 // indirect //gomodjail:unconfined - github.com/sirupsen/logrus v1.9.3 // indirect + github.com/sirupsen/logrus v1.9.3 github.com/smallstep/pkcs7 v0.1.1 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect diff --git a/pkg/api/types/container_types.go b/pkg/api/types/container_types.go index 3e157bb303d..a19fb5aea1f 100644 --- a/pkg/api/types/container_types.go +++ b/pkg/api/types/container_types.go @@ -292,13 +292,12 @@ type ContainerCreateOptions struct { ImagePullOpt ImagePullOptions // Healthcheck related fields - HealthCmd string - HealthInterval time.Duration - HealthTimeout time.Duration - HealthRetries int - HealthStartPeriod time.Duration - HealthStartInterval time.Duration - NoHealthcheck bool + HealthCmd string + HealthInterval time.Duration + HealthTimeout time.Duration + HealthRetries int + HealthStartPeriod time.Duration + NoHealthcheck bool // UserNS name for user namespace mapping of container UserNS string diff --git a/pkg/cmd/container/create.go b/pkg/cmd/container/create.go index a0c8fc2bf9b..232d8a27b77 100644 --- a/pkg/cmd/container/create.go +++ b/pkg/cmd/container/create.go @@ -891,9 +891,6 @@ func withHealthcheck(options types.ContainerCreateOptions, ensuredImage *imgutil if options.HealthStartPeriod != 0 { hc.StartPeriod = options.HealthStartPeriod } - if options.HealthStartInterval != 0 { - hc.StartInterval = options.HealthStartInterval - } // If no healthcheck config is set (via CLI or image), return empty string so we skip adding to container config. if reflect.DeepEqual(hc, &healthcheck.Healthcheck{}) { diff --git a/pkg/cmd/container/health_check.go b/pkg/cmd/container/health_check.go index 6fe31c8ebc3..e2646497c3f 100644 --- a/pkg/cmd/container/health_check.go +++ b/pkg/cmd/container/health_check.go @@ -59,7 +59,6 @@ func HealthCheck(ctx context.Context, client *containerd.Client, container conta hcConfig.Interval = timeoutWithDefault(hcConfig.Interval, healthcheck.DefaultProbeInterval) hcConfig.Timeout = timeoutWithDefault(hcConfig.Timeout, healthcheck.DefaultProbeTimeout) hcConfig.StartPeriod = timeoutWithDefault(hcConfig.StartPeriod, healthcheck.DefaultStartPeriod) - hcConfig.StartInterval = timeoutWithDefault(hcConfig.StartInterval, healthcheck.DefaultStartInterval) if hcConfig.Retries == 0 { hcConfig.Retries = healthcheck.DefaultProbeRetries } diff --git a/pkg/cmd/container/kill.go b/pkg/cmd/container/kill.go index 080336d9f87..d42a7cd8c82 100644 --- a/pkg/cmd/container/kill.go +++ b/pkg/cmd/container/kill.go @@ -35,6 +35,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/api/types" "github.com/containerd/nerdctl/v2/pkg/clientutil" "github.com/containerd/nerdctl/v2/pkg/containerutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/netutil" @@ -111,6 +112,11 @@ func killContainer(ctx context.Context, container containerd.Container, signal s return err } + // Clean up healthcheck systemd units + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + log.G(ctx).Warnf("failed to clean up healthcheck units for container %s: %s", container.ID(), err) + } + // signal will be sent once resume is finished if paused { if err := task.Resume(ctx); err != nil { diff --git a/pkg/cmd/container/remove.go b/pkg/cmd/container/remove.go index 28048a2f6a1..b9df2b2acaf 100644 --- a/pkg/cmd/container/remove.go +++ b/pkg/cmd/container/remove.go @@ -34,6 +34,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/clientutil" "github.com/containerd/nerdctl/v2/pkg/containerutil" "github.com/containerd/nerdctl/v2/pkg/dnsutil/hostsstore" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" "github.com/containerd/nerdctl/v2/pkg/ipcutil" "github.com/containerd/nerdctl/v2/pkg/labels" @@ -179,6 +180,11 @@ func RemoveContainer(ctx context.Context, c containerd.Container, globalOptions // Otherwise, nil the error so that we do not write the error label on the container retErr = nil + // Clean up healthcheck systemd units + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, c); err != nil { + log.G(ctx).WithError(err).Warnf("failed to clean up healthcheck units for container %q", id) + } + // Now, delete the actual container var delOpts []containerd.DeleteOpts if _, err := c.Image(ctx); err == nil { diff --git a/pkg/cmd/container/stop.go b/pkg/cmd/container/stop.go index e1f347b6b96..755686e4bd8 100644 --- a/pkg/cmd/container/stop.go +++ b/pkg/cmd/container/stop.go @@ -25,6 +25,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/api/types" "github.com/containerd/nerdctl/v2/pkg/containerutil" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker" ) @@ -39,6 +40,9 @@ func Stop(ctx context.Context, client *containerd.Client, reqs []string, opt typ if err := cleanupNetwork(ctx, found.Container, opt.GOptions); err != nil { return fmt.Errorf("unable to cleanup network for container: %s", found.Req) } + if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, found.Container); err != nil { + return fmt.Errorf("unable to cleanup healthcheck timer for container: %s: %w", found.Req, err) + } if err := containerutil.Stop(ctx, found.Container, opt.Timeout, opt.Signal); err != nil { if errdefs.IsNotFound(err) { fmt.Fprintf(opt.Stderr, "No such container: %s\n", found.Req) diff --git a/pkg/containerutil/containerutil.go b/pkg/containerutil/containerutil.go index 60da6f11895..65ce184c613 100644 --- a/pkg/containerutil/containerutil.go +++ b/pkg/containerutil/containerutil.go @@ -48,6 +48,7 @@ import ( "github.com/containerd/nerdctl/v2/pkg/consoleutil" "github.com/containerd/nerdctl/v2/pkg/errutil" "github.com/containerd/nerdctl/v2/pkg/formatter" + "github.com/containerd/nerdctl/v2/pkg/healthcheck" "github.com/containerd/nerdctl/v2/pkg/ipcutil" "github.com/containerd/nerdctl/v2/pkg/labels" "github.com/containerd/nerdctl/v2/pkg/labels/k8slabels" @@ -286,6 +287,15 @@ func Start(ctx context.Context, container containerd.Container, isAttach bool, i if err := task.Start(ctx); err != nil { return err } + + // If container has health checks configured, create and start systemd timer/service files. + if err := healthcheck.CreateTimer(ctx, container); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, container); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + if !isAttach { return nil } @@ -349,6 +359,11 @@ func Stop(ctx context.Context, container containerd.Container, timeout *time.Dur } }() + // Clean up healthcheck units if configured. + // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) + // } + if timeout == nil { t, ok := l[labels.StopTimeout] if !ok { @@ -487,6 +502,11 @@ func Pause(ctx context.Context, client *containerd.Client, id string) error { return err } + // Clean up healthcheck units if configured. + // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { + // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) + // } + switch status.Status { case containerd.Paused: return fmt.Errorf("container %s is already paused", id) @@ -514,6 +534,14 @@ func Unpause(ctx context.Context, client *containerd.Client, id string) error { return err } + // Recreate healthcheck related systemd timer/service files. + if err := healthcheck.CreateTimer(ctx, container); err != nil { + return fmt.Errorf("failed to create healthcheck timer: %w", err) + } + if err := healthcheck.StartTimer(ctx, container); err != nil { + return fmt.Errorf("failed to start healthcheck timer: %w", err) + } + switch status.Status { case containerd.Paused: return task.Resume(ctx) diff --git a/pkg/healthcheck/executor.go b/pkg/healthcheck/executor.go index f5c4216b302..d362d988cf8 100644 --- a/pkg/healthcheck/executor.go +++ b/pkg/healthcheck/executor.go @@ -128,26 +128,42 @@ func updateHealthStatus(ctx context.Context, container containerd.Container, hcC currentHealth = &HealthState{ Status: Starting, FailingStreak: 0, + StartPeriod: hcConfig.StartPeriod > 0, } } - // Check if still within start period - startPeriod := hcConfig.StartPeriod + // Get container info for start period check info, err := container.Info(ctx) if err != nil { return fmt.Errorf("failed to get container info: %w", err) } containerCreated := info.CreatedAt - stillInStartPeriod := hcResult.Start.Sub(containerCreated) < startPeriod - - // Update health status based on exit code - if hcResult.ExitCode == 0 { - currentHealth.Status = Healthy - currentHealth.FailingStreak = 0 - } else if !stillInStartPeriod { - currentHealth.FailingStreak++ - if currentHealth.FailingStreak >= hcConfig.Retries { - currentHealth.Status = Unhealthy + + // Check if we're in start period workflow + inStartPeriodTime := hcResult.Start.Sub(containerCreated) < hcConfig.StartPeriod + inStartPeriodState := currentHealth.StartPeriod + + if inStartPeriodTime && inStartPeriodState { + // Start Period Workflow + if hcResult.ExitCode == 0 { + // First healthy result transitions us out of start period + currentHealth.Status = Healthy + currentHealth.FailingStreak = 0 + currentHealth.StartPeriod = false + } + // Ignore unhealthy results during start period + } else { + // Health Interval Workflow + if hcResult.ExitCode == 0 { + if currentHealth.Status != Healthy { + currentHealth.Status = Healthy + currentHealth.FailingStreak = 0 + } + } else { + currentHealth.FailingStreak++ + if currentHealth.FailingStreak >= hcConfig.Retries && currentHealth.Status != Unhealthy { + currentHealth.Status = Unhealthy + } } } diff --git a/pkg/healthcheck/health.go b/pkg/healthcheck/health.go index 8e0301b492a..b38072e8562 100644 --- a/pkg/healthcheck/health.go +++ b/pkg/healthcheck/health.go @@ -43,7 +43,6 @@ const ( DefaultProbeInterval = 30 * time.Second // Default interval between probe runs. Also applies before the first probe. DefaultProbeTimeout = 30 * time.Second // Max duration a single probe run may take before it's considered failed. DefaultStartPeriod = 0 * time.Second // Grace period for container startup before health checks count as failures. - DefaultStartInterval = 5 * time.Second // Interval between checks during the start period. DefaultProbeRetries = 3 // Number of consecutive failures before marking container as unhealthy. MaxLogEntries = 5 // Maximum number of health check log entries to keep. MaxOutputLenForInspect = 4096 // Max output length (in bytes) stored in health check logs during inspect. Longer outputs are truncated. @@ -70,18 +69,18 @@ type HealthcheckResult struct { // Healthcheck represents the health check configuration type Healthcheck struct { - Test []string `json:"Test,omitempty"` // Test is the check to perform that the container is healthy - Interval time.Duration `json:"Interval,omitempty"` // Interval is the time to wait between checks - Timeout time.Duration `json:"Timeout,omitempty"` // Timeout is the time to wait before considering the check to have hung - Retries int `json:"Retries,omitempty"` // Retries is the number of consecutive failures needed to consider a container as unhealthy - StartPeriod time.Duration `json:"StartPeriod,omitempty"` // StartPeriod is the period for the container to initialize before the health check starts - StartInterval time.Duration `json:"StartInterval,omitempty"` // StartInterval is the time between health checks during the start period + Test []string `json:"Test,omitempty"` // Test is the check to perform that the container is healthy + Interval time.Duration `json:"Interval,omitempty"` // Interval is the time to wait between checks + Timeout time.Duration `json:"Timeout,omitempty"` // Timeout is the time to wait before considering the check to have hung + Retries int `json:"Retries,omitempty"` // Retries is the number of consecutive failures needed to consider a container as unhealthy + StartPeriod time.Duration `json:"StartPeriod,omitempty"` // StartPeriod is the period for the container to initialize before the health check starts } // HealthState stores the current health state of a container type HealthState struct { Status HealthStatus // Status is one of [Starting], [Healthy] or [Unhealthy] FailingStreak int // FailingStreak is the number of consecutive failures + StartPeriod bool // StartPeriod indicates if we're in the start period workflow } // ToJSONString serializes HealthState to a JSON string for label storage diff --git a/pkg/healthcheck/healthcheck_manager_darwin.go b/pkg/healthcheck/healthcheck_manager_darwin.go new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_darwin.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_freebsd.go b/pkg/healthcheck/healthcheck_manager_freebsd.go new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_freebsd.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go new file mode 100644 index 00000000000..5bfd576b126 --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -0,0 +1,214 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + "fmt" + "math/rand" + "os" + "os/exec" + "strings" + + "github.com/coreos/go-systemd/v22/dbus" + "github.com/sirupsen/logrus" + + containerd "github.com/containerd/containerd/v2/client" + "github.com/containerd/log" + + "github.com/containerd/nerdctl/v2/pkg/defaults" + "github.com/containerd/nerdctl/v2/pkg/labels" + "github.com/containerd/nerdctl/v2/pkg/rootlessutil" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + hc := extractHealthcheck(ctx, container) + if hc == nil { + return nil + } + if shouldSkipHealthCheckSystemd(hc) { + return nil + } + + containerID := container.ID() + hcName := hcUnitName(containerID, true) + logrus.Debugf("Creating healthcheck timer unit: %s", hcName) + + cmd := []string{} + if rootlessutil.IsRootless() { + cmd = append(cmd, "--user") + } + if path := os.Getenv("PATH"); path != "" { + cmd = append(cmd, "--setenv=PATH="+path) + } + + // Always use health-interval for timer frequency + cmd = append(cmd, "--unit", hcName, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s") + + cmd = append(cmd, "nerdctl", "container", "healthcheck", containerID) + if logrus.IsLevelEnabled(logrus.DebugLevel) { + cmd = append(cmd, "--debug") + } + + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return fmt.Errorf("systemd DBUS connect error: %w", err) + } + defer conn.Close() + + logrus.Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " ")) + run := exec.Command("systemd-run", cmd...) + if out, err := run.CombinedOutput(); err != nil { + return fmt.Errorf("systemd-run failed: %w\noutput: %s", err, strings.TrimSpace(string(out))) + } + + return nil +} + +// StartTimer starts the healthcheck timer unit. +// TODO if we persist hcName to container state, pass that to this function. +func StartTimer(ctx context.Context, container containerd.Container) error { + hc := extractHealthcheck(ctx, container) + if hc == nil { + return nil + } + if shouldSkipHealthCheckSystemd(hc) { + return nil + } + + hcName := hcUnitName(container.ID(), true) + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return fmt.Errorf("systemd DBUS connect error: %w", err) + } + defer conn.Close() + + startChan := make(chan string) + unit := hcName + ".service" + if _, err := conn.RestartUnitContext(context.Background(), unit, "fail", startChan); err != nil { + return err + } + if msg := <-startChan; msg != "done" { + return fmt.Errorf("unexpected systemd restart result: %s", msg) + } + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + hc := extractHealthcheck(ctx, container) + if hc == nil { + return nil + } + if shouldSkipHealthCheckSystemd(hc) { + return nil + } + + return RemoveTransientHealthCheckFilesByID(ctx, container.ID()) +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + // Don't proceed if systemd is unavailable or disabled + if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + return nil + } + + // Skip healthchecks in rootless environments to avoid systemd DBUS permission issues + if rootlessutil.IsRootless() { + return nil + } + + logrus.Debugf("Removing healthcheck timer unit: %s", containerID) + + conn, err := dbus.NewSystemConnectionContext(context.Background()) + if err != nil { + return fmt.Errorf("systemd DBUS connect error: %w", err) + } + defer conn.Close() + + unitName := hcUnitName(containerID, true) + timer := unitName + ".timer" + service := unitName + ".service" + + // Stop timer + tChan := make(chan string) + if _, err := conn.StopUnitContext(context.Background(), timer, "ignore-dependencies", tChan); err == nil { + if msg := <-tChan; msg != "done" { + logrus.Warnf("timer stop message: %s", msg) + } + } + + // Stop service + sChan := make(chan string) + if _, err := conn.StopUnitContext(context.Background(), service, "ignore-dependencies", sChan); err == nil { + if msg := <-sChan; msg != "done" { + logrus.Warnf("service stop message: %s", msg) + } + } + + // Reset failed units + _ = conn.ResetFailedUnitContext(context.Background(), service) + return nil +} + +// hcUnitName returns a systemd unit name for a container healthcheck. +func hcUnitName(containerID string, bare bool) string { + unit := containerID + if !bare { + unit += fmt.Sprintf("-%x", rand.Int()) + } + return unit +} + +func extractHealthcheck(ctx context.Context, container containerd.Container) *Healthcheck { + l, err := container.Labels(ctx) + if err != nil { + log.G(ctx).WithError(err).Debugf("could not get labels for container %s", container.ID()) + return nil + } + hcStr, ok := l[labels.HealthCheck] + if !ok || hcStr == "" { + return nil + } + hc, err := HealthCheckFromJSON(hcStr) + if err != nil { + log.G(ctx).WithError(err).Debugf("invalid healthcheck config on container %s", container.ID()) + return nil + } + return hc +} + +// shouldSkipHealthCheckSystemd determines if healthcheck timers should be skipped. +func shouldSkipHealthCheckSystemd(hc *Healthcheck) bool { + // Don't proceed if systemd is unavailable or disabled + if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { + return true + } + + // Skip healthchecks in rootless environments to avoid systemd DBUS permission issues + if rootlessutil.IsRootless() { + return true + } + + // Don't proceed if health check is nil, empty, explicitly NONE or interval is 0. + if hc == nil || len(hc.Test) == 0 || hc.Test[0] == "NONE" || hc.Interval == 0 { + return true + } + return false +} diff --git a/pkg/healthcheck/healthcheck_manager_windows.go b/pkg/healthcheck/healthcheck_manager_windows.go new file mode 100644 index 00000000000..63085d4feae --- /dev/null +++ b/pkg/healthcheck/healthcheck_manager_windows.go @@ -0,0 +1,43 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package healthcheck + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" +) + +// CreateTimer sets up the transient systemd timer and service for healthchecks. +func CreateTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// StartTimer starts the healthcheck timer unit. +func StartTimer(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFiles stops and cleans up the transient timer and service. +func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.Container) error { + return nil +} + +// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/ocihook/ocihook.go b/pkg/ocihook/ocihook.go index 89b6c6b1410..86d4a826f7b 100644 --- a/pkg/ocihook/ocihook.go +++ b/pkg/ocihook/ocihook.go @@ -560,8 +560,11 @@ func onCreateRuntime(opts *handlerOpts) error { } func onPostStop(opts *handlerOpts) error { + log.L.Debugf("onPostStop hook triggered for container %s (namespace: %s)", opts.state.ID, opts.state.Annotations[labels.Namespace]) + lf, err := state.New(opts.state.Annotations[labels.StateDir]) if err != nil { + log.L.WithError(err).Errorf("failed to create state store for container %s", opts.state.ID) return err } @@ -585,6 +588,7 @@ func onPostStop(opts *handlerOpts) error { ctx := context.Background() ns := opts.state.Annotations[labels.Namespace] + if opts.cni != nil { var err error b4nnEnabled, b4nnBindEnabled, err := bypass4netnsutil.IsBypass4netnsEnabled(opts.state.Annotations) From e940e1e66715ddc2391efcb728a09d38533d0f8d Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Wed, 27 Aug 2025 18:17:58 +0000 Subject: [PATCH 2/7] Update documentation Signed-off-by: Arjun Raja Yogidas --- .../container_health_check_linux_test.go | 50 +++++++++--- docs/healthchecks.md | 79 ++++++++++++++----- go.mod | 2 +- pkg/containerutil/containerutil.go | 10 --- pkg/healthcheck/healthcheck_manager_linux.go | 27 ++----- 5 files changed, 106 insertions(+), 62 deletions(-) diff --git a/cmd/nerdctl/container/container_health_check_linux_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go index 48c84618fcb..8c71fb31c51 100644 --- a/cmd/nerdctl/container/container_health_check_linux_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -19,6 +19,7 @@ package container import ( "encoding/json" "errors" + "fmt" "strings" "testing" "time" @@ -37,9 +38,6 @@ import ( ) func TestContainerHealthCheckBasic(t *testing.T) { - if rootlessutil.IsRootless() { - t.Skip("healthcheck tests are skipped in rootless environment") - } testCase := nerdtest.Setup() @@ -138,10 +136,6 @@ func TestContainerHealthCheckBasic(t *testing.T) { } func TestContainerHealthCheckAdvance(t *testing.T) { - if rootlessutil.IsRootless() { - t.Skip("healthcheck tests are skipped in rootless environment") - } - testCase := nerdtest.Setup() // Docker CLI does not provide a standalone healthcheck command. @@ -399,6 +393,43 @@ func TestContainerHealthCheckAdvance(t *testing.T) { } }, }, + { + Description: "Healthcheck emits large output repeatedly", + Setup: func(data test.Data, helpers test.Helpers) { + helpers.Ensure("run", "-d", "--name", data.Identifier(), + "--health-cmd", "yes X | head -c 60000", + "--health-interval", "1s", "--health-timeout", "2s", + testutil.CommonImage, "sleep", nerdtest.Infinity) + nerdtest.EnsureContainerStarted(helpers, data.Identifier()) + }, + Cleanup: func(data test.Data, helpers test.Helpers) { + helpers.Anyhow("rm", "-f", data.Identifier()) + }, + Command: func(data test.Data, helpers test.Helpers) test.TestableCommand { + for i := 0; i < 3; i++ { + helpers.Ensure("container", "healthcheck", data.Identifier()) + time.Sleep(2 * time.Second) + } + return helpers.Command("inspect", data.Identifier()) + }, + Expected: func(data test.Data, helpers test.Helpers) *test.Expected { + return &test.Expected{ + ExitCode: 0, + Output: expect.All(func(_ string, t tig.T) { + inspect := nerdtest.InspectContainer(helpers, data.Identifier()) + h := inspect.State.Health + debug, _ := json.MarshalIndent(h, "", " ") + t.Log(string(debug)) + assert.Assert(t, h != nil, "expected health state") + assert.Equal(t, h.Status, healthcheck.Healthy) + assert.Assert(t, len(h.Log) >= 3, "expected at least 3 health log entries") + for _, log := range h.Log { + assert.Assert(t, len(log.Output) >= 1024, fmt.Sprintf("each output should be >= 1024 bytes, was: %s", log.Output)) + } + }), + } + }, + }, { Description: "Health log in inspect keeps only the latest 5 entries", Setup: func(data test.Data, helpers test.Helpers) { @@ -587,13 +618,10 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) { "--health-interval", "2s", testutil.CommonImage, "sleep", "30") nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - // Wait for a healthcheck to execute - time.Sleep(2 * time.Second) }, Cleanup: func(data test.Data, helpers test.Helpers) { // Ensure proper cleanup of systemd units helpers.Anyhow("stop", data.Identifier()) - time.Sleep(500 * time.Millisecond) // Allow systemd cleanup helpers.Anyhow("rm", "-f", data.Identifier()) }, Expected: func(data test.Data, helpers test.Helpers) *test.Expected { @@ -617,9 +645,7 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) { "--health-interval", "1s", testutil.CommonImage, "sleep", "30") nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - time.Sleep(2 * time.Second) // Wait for at least one health check to execute helpers.Ensure("kill", data.Identifier()) // Kill the container - time.Sleep(3 * time.Second) // Wait to allow any potential extra healthchecks (shouldn't happen) }, Cleanup: func(data test.Data, helpers test.Helpers) { // Container is already killed, just remove it diff --git a/docs/healthchecks.md b/docs/healthchecks.md index b47c92748b5..3bf78810b8d 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -2,25 +2,29 @@ `nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command. -Currently, health checks can be triggered manually using the nerdctl container healthcheck command. Automatic orchestration (e.g., periodic checks) will be added in a future update. +## Configuration Options Health checks can be configured in multiple ways: -1. At container creation time using nerdctl run or nerdctl create with `--health-*` flags +1. At container creation time using `nerdctl run` or `nerdctl create` with these flags: + - `--health-cmd`: Command to run to check health + - `--health-interval`: Time between running the check (default: 30s) + - `--health-timeout`: Maximum time to allow one check to run (default: 30s) + - `--health-retries`: Consecutive failures needed to report unhealthy (default: 3) + - `--health-start-period`: Start period for the container to initialize before starting health-retries countdown + - `--no-healthcheck`: Disable any container-specified HEALTHCHECK + 2. At image build time using HEALTHCHECK in a Dockerfile -3. In docker-compose.yaml files, if using nerdctl compose -When a container is created, nerdctl determines the health check configuration based on the following priority: +**Note:** The `--health-start-interval` option is currently not supported by nerdctl. -1. **CLI flags** take highest precedence (e.g., `--health-cmd`, etc.) -2. If no CLI flags are set, nerdctl will use any health check defined in the image. -3. If neither is present, no health check will be configured +## Configuration Priority -Example: +When a container is created, nerdctl determines the health check configuration based on this priority: -```bash -nerdctl run --name web --health-cmd="curl -f http://localhost || exit 1" --health-interval=30s --health-timeout=5s --health-retries=3 nginx -``` +1. CLI flags take highest precedence (e.g., `--health-cmd`, etc.) +2. If no CLI flags are set, nerdctl will use any health check defined in the image +3. If neither is present, no health check will be configured ### Disabling Health Checks @@ -37,15 +41,54 @@ configured health check inside the container and reports the result. It serves a health checks, especially in scenarios where external scheduling is used. Example: - ``` nerdctl container healthcheck ``` -### Future Work (WIP) +## Automatic Health Checks with systemd + +On Linux systems with systemd, nerdctl automatically creates and manages systemd timer units to execute health checks at the configured intervals. This provides reliable scheduling and execution of health checks without requiring a persistent daemon. + +### Requirements for Automatic Health Checks + +- systemd must be available on the system +- Container must not be running in rootless mode +- Environment variable `DISABLE_HC_SYSTEMD` must not be set to "true" + +### How It Works -Since nerdctl is daemonless and does not have a persistent background process, we rely on systemd(or external schedulers) -to invoke nerdctl container healthcheck at configured intervals. This allows periodic health checks for containers in a -systemd-based environment. We are actively working on automating health checks, where we will listen to container lifecycle -events and generate appropriate systemd service and timer units. This will enable nerdctl to support automated, -Docker-compatible health checks by leveraging systemd for scheduling and lifecycle integration. \ No newline at end of file +1. When a container with health checks is created, nerdctl: + - Creates a systemd timer unit for the container + - Configures the timer according to the health check interval + - Starts monitoring the container's health status + +2. The health check status can be one of: + - `starting`: During container initialization + - `healthy`: When health checks are passing + - `unhealthy`: After specified number of consecutive failures +## Examples + +1. Basic health check that verifies a web server: +```bash +nerdctl run -d --name web \ + --health-cmd="curl -f http://localhost/ || exit 1" \ + --health-interval=5s \ + --health-retries=3 \ + nginx +``` + +2. Health check with initialization period: +```bash +nerdctl run -d --name app \ + --health-cmd="./health-check.sh" \ + --health-interval=30s \ + --health-timeout=10s \ + --health-retries=3 \ + --health-start-period=60s \ + myapp +``` + +3. Disable health checks: +```bash +nerdctl run --no-healthcheck myapp +``` diff --git a/go.mod b/go.mod index 23c621b97d1..6d554b021f3 100644 --- a/go.mod +++ b/go.mod @@ -120,7 +120,7 @@ require ( github.com/santhosh-tekuri/jsonschema/v6 v6.0.1 // indirect github.com/sasha-s/go-deadlock v0.3.5 // indirect //gomodjail:unconfined - github.com/sirupsen/logrus v1.9.3 + github.com/sirupsen/logrus v1.9.3 // indirect github.com/smallstep/pkcs7 v0.1.1 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 // indirect diff --git a/pkg/containerutil/containerutil.go b/pkg/containerutil/containerutil.go index 65ce184c613..73c7b1a4c35 100644 --- a/pkg/containerutil/containerutil.go +++ b/pkg/containerutil/containerutil.go @@ -359,11 +359,6 @@ func Stop(ctx context.Context, container containerd.Container, timeout *time.Dur } }() - // Clean up healthcheck units if configured. - // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { - // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) - // } - if timeout == nil { t, ok := l[labels.StopTimeout] if !ok { @@ -502,11 +497,6 @@ func Pause(ctx context.Context, client *containerd.Client, id string) error { return err } - // Clean up healthcheck units if configured. - // if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil { - // return fmt.Errorf("failed to clean up healthcheck units for container %s", container.ID()) - // } - switch status.Status { case containerd.Paused: return fmt.Errorf("container %s is already paused", id) diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go index 5bfd576b126..ba0a7dbbf11 100644 --- a/pkg/healthcheck/healthcheck_manager_linux.go +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -25,7 +25,6 @@ import ( "strings" "github.com/coreos/go-systemd/v22/dbus" - "github.com/sirupsen/logrus" containerd "github.com/containerd/containerd/v2/client" "github.com/containerd/log" @@ -47,12 +46,9 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { containerID := container.ID() hcName := hcUnitName(containerID, true) - logrus.Debugf("Creating healthcheck timer unit: %s", hcName) + log.G(ctx).Debugf("Creating healthcheck timer unit: %s", hcName) cmd := []string{} - if rootlessutil.IsRootless() { - cmd = append(cmd, "--user") - } if path := os.Getenv("PATH"); path != "" { cmd = append(cmd, "--setenv=PATH="+path) } @@ -61,7 +57,7 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { cmd = append(cmd, "--unit", hcName, "--on-unit-inactive="+hc.Interval.String(), "--timer-property=AccuracySec=1s") cmd = append(cmd, "nerdctl", "container", "healthcheck", containerID) - if logrus.IsLevelEnabled(logrus.DebugLevel) { + if log.G(ctx).Logger.IsLevelEnabled(log.DebugLevel) { cmd = append(cmd, "--debug") } @@ -71,7 +67,7 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { } defer conn.Close() - logrus.Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " ")) + log.G(ctx).Debugf("creating healthcheck timer with: systemd-run %s", strings.Join(cmd, " ")) run := exec.Command("systemd-run", cmd...) if out, err := run.CombinedOutput(); err != nil { return fmt.Errorf("systemd-run failed: %w\noutput: %s", err, strings.TrimSpace(string(out))) @@ -81,7 +77,6 @@ func CreateTimer(ctx context.Context, container containerd.Container) error { } // StartTimer starts the healthcheck timer unit. -// TODO if we persist hcName to container state, pass that to this function. func StartTimer(ctx context.Context, container containerd.Container) error { hc := extractHealthcheck(ctx, container) if hc == nil { @@ -124,17 +119,7 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C // RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { - // Don't proceed if systemd is unavailable or disabled - if !defaults.IsSystemdAvailable() || os.Getenv("DISABLE_HC_SYSTEMD") == "true" { - return nil - } - - // Skip healthchecks in rootless environments to avoid systemd DBUS permission issues - if rootlessutil.IsRootless() { - return nil - } - - logrus.Debugf("Removing healthcheck timer unit: %s", containerID) + log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID) conn, err := dbus.NewSystemConnectionContext(context.Background()) if err != nil { @@ -150,7 +135,7 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string tChan := make(chan string) if _, err := conn.StopUnitContext(context.Background(), timer, "ignore-dependencies", tChan); err == nil { if msg := <-tChan; msg != "done" { - logrus.Warnf("timer stop message: %s", msg) + log.G(ctx).Warnf("timer stop message: %s", msg) } } @@ -158,7 +143,7 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string sChan := make(chan string) if _, err := conn.StopUnitContext(context.Background(), service, "ignore-dependencies", sChan); err == nil { if msg := <-sChan; msg != "done" { - logrus.Warnf("service stop message: %s", msg) + log.G(ctx).Warnf("service stop message: %s", msg) } } From fe75a0fcb06df25933026a7a0f3bf112d948c632 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:53:49 +0000 Subject: [PATCH 3/7] build(deps): bump github.com/spf13/cobra from 1.9.1 to 1.10.1 Bumps [github.com/spf13/cobra](https://github.com/spf13/cobra) from 1.9.1 to 1.10.1. - [Release notes](https://github.com/spf13/cobra/releases) - [Commits](https://github.com/spf13/cobra/compare/v1.9.1...v1.10.1) --- updated-dependencies: - dependency-name: github.com/spf13/cobra dependency-version: 1.10.1 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 4 ++-- go.sum | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/go.mod b/go.mod index 6d554b021f3..e4cddc34312 100644 --- a/go.mod +++ b/go.mod @@ -56,8 +56,8 @@ require ( github.com/pelletier/go-toml/v2 v2.2.4 github.com/rootless-containers/bypass4netns v0.4.2 //gomodjail:unconfined github.com/rootless-containers/rootlesskit/v2 v2.3.5 //gomodjail:unconfined - github.com/spf13/cobra v1.9.1 //gomodjail:unconfined - github.com/spf13/pflag v1.0.7 //gomodjail:unconfined + github.com/spf13/cobra v1.10.1 //gomodjail:unconfined + github.com/spf13/pflag v1.0.9 //gomodjail:unconfined github.com/vishvananda/netlink v1.3.1 //gomodjail:unconfined github.com/vishvananda/netns v0.0.5 //gomodjail:unconfined github.com/yuchanns/srslog v1.1.0 diff --git a/go.sum b/go.sum index eba472458ec..05fb4fca517 100644 --- a/go.sum +++ b/go.sum @@ -286,11 +286,10 @@ github.com/smallstep/pkcs7 v0.1.1 h1:x+rPdt2W088V9Vkjho4KtoggyktZJlMduZAtRHm68LU github.com/smallstep/pkcs7 v0.1.1/go.mod h1:dL6j5AIz9GHjVEBTXtW+QliALcgM19RtXaTeyxI+AfA= github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= -github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= -github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= -github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s= +github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6 h1:pnnLyeX7o/5aX8qUQ69P/mLojDqwda8hFOCBTmP/6hw= github.com/stefanberger/go-pkcs11uri v0.0.0-20230803200340-78284954bff6/go.mod h1:39R/xuhNgVhi+K0/zst4TLrJrVmbm6LVgl4A0+ZFS5M= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= From 1accbf302c0c86431e9e9b668a961c100a247bae Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:53:49 +0000 Subject: [PATCH 4/7] build(deps): bump github.com/containernetworking/plugins Bumps [github.com/containernetworking/plugins](https://github.com/containernetworking/plugins) from 1.7.1 to 1.8.0. - [Release notes](https://github.com/containernetworking/plugins/releases) - [Commits](https://github.com/containernetworking/plugins/compare/v1.7.1...v1.8.0) --- updated-dependencies: - dependency-name: github.com/containernetworking/plugins dependency-version: 1.8.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 8 ++++---- go.sum | 24 ++++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/go.mod b/go.mod index e4cddc34312..e5020ce8048 100644 --- a/go.mod +++ b/go.mod @@ -1,7 +1,7 @@ //gomodjail:confined module github.com/containerd/nerdctl/v2 -go 1.24.0 +go 1.24.2 require ( github.com/Masterminds/semver/v3 v3.4.0 @@ -27,7 +27,7 @@ require ( github.com/containerd/stargz-snapshotter/ipfs v0.17.0 //gomodjail:unconfined github.com/containerd/typeurl/v2 v2.2.3 github.com/containernetworking/cni v1.3.0 //gomodjail:unconfined - github.com/containernetworking/plugins v1.7.1 //gomodjail:unconfined + github.com/containernetworking/plugins v1.8.0 //gomodjail:unconfined github.com/coreos/go-iptables v0.8.0 //gomodjail:unconfined github.com/coreos/go-systemd/v22 v22.6.0 github.com/cyphar/filepath-securejoin v0.4.1 //gomodjail:unconfined @@ -86,7 +86,7 @@ require ( github.com/docker/docker-credential-helpers v0.8.2 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-jose/go-jose/v4 v4.0.5 // indirect - github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/godbus/dbus/v5 v5.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect @@ -141,7 +141,7 @@ require ( //gomodjail:unconfined google.golang.org/grpc v1.73.0 // indirect //gomodjail:unconfined - google.golang.org/protobuf v1.36.6 // indirect + google.golang.org/protobuf v1.36.7 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect lukechampine.com/blake3 v1.3.0 // indirect sigs.k8s.io/yaml v1.4.0 // indirect diff --git a/go.sum b/go.sum index 05fb4fca517..ab8acb21aa6 100644 --- a/go.sum +++ b/go.sum @@ -65,8 +65,8 @@ github.com/containerd/typeurl/v2 v2.2.3 h1:yNA/94zxWdvYACdYO8zofhrTVuQY73fFU1y++ github.com/containerd/typeurl/v2 v2.2.3/go.mod h1:95ljDnPfD3bAbDJRugOiShd/DlAAsxGtUBhJxIn7SCk= github.com/containernetworking/cni v1.3.0 h1:v6EpN8RznAZj9765HhXQrtXgX+ECGebEYEmnuFjskwo= github.com/containernetworking/cni v1.3.0/go.mod h1:Bs8glZjjFfGPHMw6hQu82RUgEPNGEaBb9KS5KtNMnJ4= -github.com/containernetworking/plugins v1.7.1 h1:CNAR0jviDj6FS5Vg85NTgKWLDzZPfi/lj+VJfhMDTIs= -github.com/containernetworking/plugins v1.7.1/go.mod h1:xuMdjuio+a1oVQsHKjr/mgzuZ24leAsqUYRnzGoXHy0= +github.com/containernetworking/plugins v1.8.0 h1:WjGbV/0UQyo8A4qBsAh6GaDAtu1hevxVxsEuqtBqUFk= +github.com/containernetworking/plugins v1.8.0/go.mod h1:JG3BxoJifxxHBhG3hFyxyhid7JgRVBu/wtooGEvWf1c= github.com/containers/ocicrypt v1.2.1 h1:0qIOTT9DoYwcKmxSt8QJt+VzMY18onl9jUXsxpVhSmM= github.com/containers/ocicrypt v1.2.1/go.mod h1:aD0AAqfMp0MtwqWgHM1bUwe1anx0VazI108CRrSKINQ= github.com/coreos/go-iptables v0.8.0 h1:MPc2P89IhuVpLI7ETL/2tx3XZ61VeICZjYqDEgNsPRc= @@ -115,8 +115,8 @@ github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8 github.com/go-jose/go-jose/v4 v4.0.5 h1:M6T8+mKZl/+fNNuFHvGIzDz7BTLQPIounk/b9dw3AaE= github.com/go-jose/go-jose/v4 v4.0.5/go.mod h1:s3P1lRrkT8igV8D9OjyL4WRyHvjB6a4JSllnOrmmBOA= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= @@ -155,8 +155,8 @@ github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= -github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 h1:EEHtgt9IwisQ2AZ4pIsMjahcegHh6rmhqxzIRQIyepY= +github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -238,10 +238,10 @@ github.com/multiformats/go-multihash v0.2.3 h1:7Lyc8XfX/IY2jWb/gI7JP+o7JEq9hOa7B github.com/multiformats/go-multihash v0.2.3/go.mod h1:dXgKXCXjBzdscBLk9JkjINiEsCKRVch90MdaGiKsvSM= github.com/multiformats/go-varint v0.0.7 h1:sWSGR+f/eu5ABZA2ZpYKBILXTTs9JWpdEM/nEGOHFS8= github.com/multiformats/go-varint v0.0.7/go.mod h1:r8PUYw/fD/SjBCiKOoDlGF6QawOELpZAu9eioSos/OU= -github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= -github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= -github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= +github.com/onsi/ginkgo/v2 v2.25.1 h1:Fwp6crTREKM+oA6Cz4MsO8RhKQzs2/gOIVOUscMAfZY= +github.com/onsi/ginkgo/v2 v2.25.1/go.mod h1:ppTWQ1dh9KM/F1XgpeRqelR+zHVwV81DGRSDnFxK7Sk= +github.com/onsi/gomega v1.38.1 h1:FaLA8GlcpXDwsb7m0h2A9ew2aTk3vnZMlzFgg5tz/pk= +github.com/onsi/gomega v1.38.1/go.mod h1:LfcV8wZLvwcYRwPiJysphKAEsmcFnLMK/9c+PjvlX8g= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= @@ -499,8 +499,8 @@ google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2 google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= +google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= From 7b5d8ef52a41a5e543e1e301705430fec8695eeb Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Wed, 27 Aug 2025 18:17:58 +0000 Subject: [PATCH 5/7] Update documentation Signed-off-by: Arjun Raja Yogidas --- docs/healthchecks.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/healthchecks.md b/docs/healthchecks.md index 3bf78810b8d..0da55d355c5 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -1,6 +1,6 @@ # Health Check Support in nerdctl -`nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command. +`nerdctl` supports Docker-compatible health checks for containers, allowing you to monitor container health through user-defined commands. ## Configuration Options @@ -31,7 +31,13 @@ When a container is created, nerdctl determines the health check configuration b You can disable health checks using the following flag during container create/run: ```bash ---no-healthcheck +nerdctl run -d --name app \ + --health-cmd="./health-check.sh" \ + --health-interval=30s \ + --health-timeout=10s \ + --health-retries=3 \ + --health-start-period=60s \ + myapp ``` ### Running Health Checks Manually From d035a7fafea009d7c479f9deaece720e843cb8a6 Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Mon, 1 Sep 2025 06:14:59 +0000 Subject: [PATCH 6/7] Address PR comments Signed-off-by: Arjun Raja Yogidas --- docs/healthchecks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/healthchecks.md b/docs/healthchecks.md index 0da55d355c5..933116e3941 100644 --- a/docs/healthchecks.md +++ b/docs/healthchecks.md @@ -1,6 +1,6 @@ # Health Check Support in nerdctl -`nerdctl` supports Docker-compatible health checks for containers, allowing you to monitor container health through user-defined commands. +`nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command. ## Configuration Options From 98cf1c8b83df7eedad3edb0bc87f51725d1fb27d Mon Sep 17 00:00:00 2001 From: Arjun Raja Yogidas Date: Thu, 4 Sep 2025 06:41:25 +0000 Subject: [PATCH 7/7] add non blocking systemd removal Signed-off-by: Arjun Raja Yogidas --- .../container_health_check_linux_test.go | 6 - pkg/healthcheck/healthcheck_manager_darwin.go | 10 ++ pkg/healthcheck/healthcheck_manager_linux.go | 122 +++++++++++++++++- .../healthcheck_manager_windows.go | 10 ++ 4 files changed, 138 insertions(+), 10 deletions(-) diff --git a/cmd/nerdctl/container/container_health_check_linux_test.go b/cmd/nerdctl/container/container_health_check_linux_test.go index 8c71fb31c51..f6dc9678b66 100644 --- a/cmd/nerdctl/container/container_health_check_linux_test.go +++ b/cmd/nerdctl/container/container_health_check_linux_test.go @@ -695,8 +695,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { "--health-interval", "1s", testutil.CommonImage, "sleep", "30") nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - // Wait longer for systemd timer creation and first healthcheck execution - time.Sleep(3 * time.Second) }, Cleanup: func(data test.Data, helpers test.Helpers) { helpers.Anyhow("rm", "-f", data.Identifier()) @@ -724,7 +722,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { }) // Stop container and verify cleanup helpers.Ensure("stop", data.Identifier()) - time.Sleep(500 * time.Millisecond) // Allow cleanup to complete // Check that timer is gone result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") @@ -748,7 +745,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { "--health-interval", "2s", testutil.CommonImage, "sleep", "60") nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - time.Sleep(3 * time.Second) // Wait for initial timer creation }, Cleanup: func(data test.Data, helpers test.Helpers) { helpers.Anyhow("rm", "-f", data.Identifier()) @@ -770,7 +766,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { // Step 2: Stop container helpers.Ensure("stop", data.Identifier()) - time.Sleep(1 * time.Second) // Allow cleanup // Step 3: Verify timer is removed after stop result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") @@ -785,7 +780,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) { // Step 4: Restart container helpers.Ensure("start", data.Identifier()) nerdtest.EnsureContainerStarted(helpers, data.Identifier()) - time.Sleep(3 * time.Second) // Wait for timer recreation // Step 5: Verify timer is recreated after restart - this is our final verification return helpers.Custom("systemctl", "list-timers", "--all", "--no-pager") diff --git a/pkg/healthcheck/healthcheck_manager_darwin.go b/pkg/healthcheck/healthcheck_manager_darwin.go index 63085d4feae..46858c4966e 100644 --- a/pkg/healthcheck/healthcheck_manager_darwin.go +++ b/pkg/healthcheck/healthcheck_manager_darwin.go @@ -38,6 +38,16 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C } // RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. +/* func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { return nil } +*/ + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. On Darwin, this is a no-op since systemd is not available. +func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + return nil +} diff --git a/pkg/healthcheck/healthcheck_manager_linux.go b/pkg/healthcheck/healthcheck_manager_linux.go index ba0a7dbbf11..71024e11a13 100644 --- a/pkg/healthcheck/healthcheck_manager_linux.go +++ b/pkg/healthcheck/healthcheck_manager_linux.go @@ -23,6 +23,7 @@ import ( "os" "os/exec" "strings" + "time" "github.com/coreos/go-systemd/v22/dbus" @@ -110,14 +111,13 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C if hc == nil { return nil } - if shouldSkipHealthCheckSystemd(hc) { - return nil - } - return RemoveTransientHealthCheckFilesByID(ctx, container.ID()) + return ForceRemoveTransientHealthCheckFiles(ctx, container.ID()) } // RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. +/* func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { log.G(ctx).Debugf("Removing healthcheck timer unit: %s", containerID) @@ -151,6 +151,120 @@ func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string _ = conn.ResetFailedUnitContext(context.Background(), service) return nil } +*/ + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. It logs errors as warnings but continues cleanup attempts. +func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + log.G(ctx).Debugf("Force removing healthcheck timer unit: %s", containerID) + + // Create a timeout context for systemd operations (5 seconds default) + timeoutCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + unitName := hcUnitName(containerID, true) + timer := unitName + ".timer" + service := unitName + ".service" + + // Channel to collect any critical errors (though we'll continue cleanup regardless) + errChan := make(chan error, 3) + + // Goroutine for DBUS connection and cleanup operations + go func() { + defer close(errChan) + + conn, err := dbus.NewSystemConnectionContext(timeoutCtx) + if err != nil { + log.G(ctx).Warnf("systemd DBUS connect error during force cleanup: %v", err) + errChan <- fmt.Errorf("systemd DBUS connect error: %w", err) + return + } + defer conn.Close() + + // Stop timer with timeout + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout stopping timer %s during force cleanup", timer) + return + default: + tChan := make(chan string, 1) + if _, err := conn.StopUnitContext(timeoutCtx, timer, "ignore-dependencies", tChan); err == nil { + select { + case msg := <-tChan: + if msg != "done" { + log.G(ctx).Warnf("timer stop message during force cleanup: %s", msg) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout waiting for timer stop confirmation: %s", timer) + } + } else { + log.G(ctx).Warnf("failed to stop timer %s during force cleanup: %v", timer, err) + } + } + }() + + // Stop service with timeout + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout stopping service %s during force cleanup", service) + return + default: + sChan := make(chan string, 1) + if _, err := conn.StopUnitContext(timeoutCtx, service, "ignore-dependencies", sChan); err == nil { + select { + case msg := <-sChan: + if msg != "done" { + log.G(ctx).Warnf("service stop message during force cleanup: %s", msg) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout waiting for service stop confirmation: %s", service) + } + } else { + log.G(ctx).Warnf("failed to stop service %s during force cleanup: %v", service, err) + } + } + }() + + // Reset failed units (best effort, non-blocking) + go func() { + select { + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("timeout resetting failed unit %s during force cleanup", service) + return + default: + if err := conn.ResetFailedUnitContext(timeoutCtx, service); err != nil { + log.G(ctx).Warnf("failed to reset failed unit %s during force cleanup: %v", service, err) + } + } + }() + + // Wait a short time for operations to complete, but don't block indefinitely + select { + case <-time.After(3 * time.Second): + log.G(ctx).Debugf("force cleanup operations completed for container %s", containerID) + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("force cleanup timed out for container %s", containerID) + } + }() + + // Wait for the cleanup goroutine to finish or timeout + select { + case err := <-errChan: + if err != nil { + log.G(ctx).Warnf("force cleanup encountered errors but continuing: %v", err) + } + case <-timeoutCtx.Done(): + log.G(ctx).Warnf("force cleanup timed out for container %s, but cleanup may continue in background", containerID) + } + + // Always return nil - this function should never block the caller + // even if systemd operations fail or timeout + log.G(ctx).Debugf("force cleanup completed (non-blocking) for container %s", containerID) + return nil +} // hcUnitName returns a systemd unit name for a container healthcheck. func hcUnitName(containerID string, bare bool) string { diff --git a/pkg/healthcheck/healthcheck_manager_windows.go b/pkg/healthcheck/healthcheck_manager_windows.go index 63085d4feae..d18c3bce72b 100644 --- a/pkg/healthcheck/healthcheck_manager_windows.go +++ b/pkg/healthcheck/healthcheck_manager_windows.go @@ -38,6 +38,16 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C } // RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID. +// This function is deprecated and no longer used. Use ForceRemoveTransientHealthCheckFiles instead. +/* func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error { return nil } +*/ + +// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service +// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging +// on systemd operations. On Windows, this is a no-op since systemd is not available. +func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error { + return nil +}