Skip to content

Commit 0e46597

Browse files
committed
add non blocking systemd removal
Signed-off-by: Arjun Raja Yogidas <[email protected]>
1 parent b58cade commit 0e46597

File tree

6 files changed

+257
-62
lines changed

6 files changed

+257
-62
lines changed

cmd/nerdctl/container/container_health_check_linux_test.go

Lines changed: 135 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,10 @@ func TestContainerHealthCheckAdvance(t *testing.T) {
608608
func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) {
609609
testCase := nerdtest.Setup()
610610
testCase.Require = require.Not(nerdtest.Docker)
611+
// Skip systemd tests in rootless environment to bypass dbus permission issues
612+
if rootlessutil.IsRootless() {
613+
t.Skip("systemd healthcheck tests are skipped in rootless environment")
614+
}
611615

612616
testCase.SubTests = []*test.Case{
613617
{
@@ -628,35 +632,72 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) {
628632
return &test.Expected{
629633
ExitCode: 0,
630634
Output: expect.All(func(stdout string, t tig.T) {
631-
inspect := nerdtest.InspectContainer(helpers, data.Identifier())
632-
h := inspect.State.Health
633-
assert.Assert(t, h != nil, "expected health state to be present")
634-
assert.Equal(t, h.Status, "healthy")
635+
var h *healthcheck.Health
636+
637+
// Poll up to 5 times for health status
638+
maxAttempts := 5
639+
var finalStatus string
640+
641+
for i := 0; i < maxAttempts; i++ {
642+
inspect := nerdtest.InspectContainer(helpers, data.Identifier())
643+
h = inspect.State.Health
644+
645+
assert.Assert(t, h != nil, "expected health state to be present")
646+
finalStatus = h.Status
647+
648+
// If healthy, break and pass the test
649+
if finalStatus == "healthy" {
650+
t.Log(fmt.Sprintf("Container became healthy on attempt %d/%d", i+1, maxAttempts))
651+
break
652+
}
653+
654+
// If unhealthy, fail immediately
655+
if finalStatus == "unhealthy" {
656+
assert.Assert(t, false, fmt.Sprintf("Container became unhealthy on attempt %d/%d, status: %s", i+1, maxAttempts, finalStatus))
657+
return
658+
}
659+
660+
// If not the last attempt, wait before retrying
661+
if i < maxAttempts-1 {
662+
t.Log(fmt.Sprintf("Attempt %d/%d: status is '%s', waiting 1 second before retry", i+1, maxAttempts, finalStatus))
663+
time.Sleep(1 * time.Second)
664+
}
665+
}
666+
667+
if finalStatus != "healthy" {
668+
assert.Assert(t, false, fmt.Sprintf("Container did not become healthy after %d attempts, final status: %s", maxAttempts, finalStatus))
669+
return
670+
}
671+
635672
assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry")
636673
}),
637674
}
638675
},
639676
},
640677
{
641-
Description: "Kill stops healthcheck execution",
678+
Description: "Kill stops healthcheck execution and cleans up systemd timer",
642679
Setup: func(data test.Data, helpers test.Helpers) {
643680
helpers.Ensure("run", "-d", "--name", data.Identifier(),
644681
"--health-cmd", "echo healthy",
645682
"--health-interval", "1s",
646683
testutil.CommonImage, "sleep", "30")
647684
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
648-
helpers.Ensure("kill", data.Identifier()) // Kill the container
685+
helpers.Ensure("kill", data.Identifier())
649686
},
650687
Cleanup: func(data test.Data, helpers test.Helpers) {
651688
// Container is already killed, just remove it
652689
helpers.Anyhow("rm", "-f", data.Identifier())
653690
},
654691
Expected: func(data test.Data, helpers test.Helpers) *test.Expected {
655692
return &test.Expected{
656-
ExitCode: 0,
657-
Output: expect.All(func(stdout string, t tig.T) {
693+
ExitCode: expect.ExitCodeNoCheck,
694+
Output: func(stdout string, t tig.T) {
695+
// Get container info for verification
658696
inspect := nerdtest.InspectContainer(helpers, data.Identifier())
697+
containerID := inspect.ID
659698
h := inspect.State.Health
699+
700+
// Verify health state and logs exist
660701
assert.Assert(t, h != nil, "expected health state to be present")
661702
assert.Assert(t, len(h.Log) > 0, "expected at least one health check log entry")
662703

@@ -666,10 +707,90 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) {
666707

667708
// Assert all healthcheck log start times are before container finished
668709
for _, entry := range h.Log {
669-
assert.NilError(t, err, "parsing healthcheck Start time")
670710
assert.Assert(t, entry.Start.Before(containerEnd), "healthcheck ran after container was killed")
671711
}
672-
}),
712+
713+
// Ensure systemd timers are removed
714+
result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")
715+
result.Run(&test.Expected{
716+
ExitCode: expect.ExitCodeNoCheck,
717+
Output: func(stdout string, _ tig.T) {
718+
assert.Assert(t, !strings.Contains(stdout, containerID),
719+
"expected nerdctl healthcheck timer for container ID %s to be removed after container stop", containerID)
720+
},
721+
})
722+
},
723+
}
724+
},
725+
},
726+
{
727+
Description: "Remove cleans up systemd timer",
728+
Setup: func(data test.Data, helpers test.Helpers) {
729+
helpers.Ensure("run", "-d", "--name", data.Identifier(),
730+
"--health-cmd", "echo healthy",
731+
"--health-interval", "1s",
732+
testutil.CommonImage, "sleep", "30")
733+
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
734+
helpers.Ensure("rm", "-f", data.Identifier())
735+
},
736+
Cleanup: func(data test.Data, helpers test.Helpers) {
737+
// Container is already removed, no cleanup needed
738+
helpers.Anyhow("rm", "-f", data.Identifier())
739+
},
740+
Expected: func(data test.Data, helpers test.Helpers) *test.Expected {
741+
return &test.Expected{
742+
ExitCode: expect.ExitCodeNoCheck,
743+
Output: func(stdout string, t tig.T) {
744+
inspect := nerdtest.InspectContainer(helpers, data.Identifier())
745+
containerID := inspect.ID
746+
747+
// Check systemd timers to ensure cleanup
748+
result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")
749+
result.Run(&test.Expected{
750+
ExitCode: expect.ExitCodeNoCheck,
751+
Output: func(stdout string, _ tig.T) {
752+
// Verify systemd timer has been cleaned up by checking systemctl output
753+
// We check that no timer contains our test identifier
754+
assert.Assert(t, !strings.Contains(stdout, containerID),
755+
"expected nerdctl healthcheck timer for container ID %s to be removed after container removal", containerID)
756+
},
757+
})
758+
},
759+
}
760+
},
761+
},
762+
{
763+
Description: "Stop cleans up systemd timer",
764+
Setup: func(data test.Data, helpers test.Helpers) {
765+
helpers.Ensure("run", "-d", "--name", data.Identifier(),
766+
"--health-cmd", "echo healthy",
767+
"--health-interval", "1s",
768+
testutil.CommonImage, "sleep", "30")
769+
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
770+
helpers.Ensure("stop", data.Identifier())
771+
},
772+
Cleanup: func(data test.Data, helpers test.Helpers) {
773+
// Container is already stopped, just remove it
774+
helpers.Anyhow("rm", "-f", data.Identifier())
775+
},
776+
Expected: func(data test.Data, helpers test.Helpers) *test.Expected {
777+
return &test.Expected{
778+
ExitCode: expect.ExitCodeNoCheck,
779+
Output: func(stdout string, t tig.T) {
780+
// Get container info for verification
781+
inspect := nerdtest.InspectContainer(helpers, data.Identifier())
782+
containerID := inspect.ID
783+
784+
// Ensure systemd timers are removed
785+
result := helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")
786+
result.Run(&test.Expected{
787+
ExitCode: expect.ExitCodeNoCheck,
788+
Output: func(stdout string, _ tig.T) {
789+
assert.Assert(t, !strings.Contains(stdout, containerID),
790+
"expected nerdctl healthcheck timer for container ID %s to be removed after container stop", containerID)
791+
},
792+
})
793+
},
673794
}
674795
},
675796
},
@@ -678,11 +799,13 @@ func TestHealthCheck_SystemdIntegration_Basic(t *testing.T) {
678799
}
679800

680801
func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
802+
803+
testCase := nerdtest.Setup()
804+
testCase.Require = require.Not(nerdtest.Docker)
805+
// Skip systemd tests in rootless environment to bypass dbus permission issues
681806
if rootlessutil.IsRootless() {
682807
t.Skip("systemd healthcheck tests are skipped in rootless environment")
683808
}
684-
testCase := nerdtest.Setup()
685-
testCase.Require = require.Not(nerdtest.Docker)
686809

687810
testCase.SubTests = []*test.Case{
688811
{
@@ -695,8 +818,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
695818
"--health-interval", "1s",
696819
testutil.CommonImage, "sleep", "30")
697820
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
698-
// Wait longer for systemd timer creation and first healthcheck execution
699-
time.Sleep(3 * time.Second)
700821
},
701822
Cleanup: func(data test.Data, helpers test.Helpers) {
702823
helpers.Anyhow("rm", "-f", data.Identifier())
@@ -724,7 +845,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
724845
})
725846
// Stop container and verify cleanup
726847
helpers.Ensure("stop", data.Identifier())
727-
time.Sleep(500 * time.Millisecond) // Allow cleanup to complete
728848

729849
// Check that timer is gone
730850
result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")
@@ -733,7 +853,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
733853
Output: func(stdout string, _ tig.T) {
734854
assert.Assert(t, !strings.Contains(stdout, containerID),
735855
"expected nerdctl healthcheck timer for container ID %s to be removed after container stop", containerID)
736-
737856
},
738857
})
739858
}),
@@ -748,7 +867,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
748867
"--health-interval", "2s",
749868
testutil.CommonImage, "sleep", "60")
750869
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
751-
time.Sleep(3 * time.Second) // Wait for initial timer creation
752870
},
753871
Cleanup: func(data test.Data, helpers test.Helpers) {
754872
helpers.Anyhow("rm", "-f", data.Identifier())
@@ -770,7 +888,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
770888

771889
// Step 2: Stop container
772890
helpers.Ensure("stop", data.Identifier())
773-
time.Sleep(1 * time.Second) // Allow cleanup
774891

775892
// Step 3: Verify timer is removed after stop
776893
result = helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")
@@ -785,7 +902,6 @@ func TestHealthCheck_SystemdIntegration_Advanced(t *testing.T) {
785902
// Step 4: Restart container
786903
helpers.Ensure("start", data.Identifier())
787904
nerdtest.EnsureContainerStarted(helpers, data.Identifier())
788-
time.Sleep(3 * time.Second) // Wait for timer recreation
789905

790906
// Step 5: Verify timer is recreated after restart - this is our final verification
791907
return helpers.Custom("systemctl", "list-timers", "--all", "--no-pager")

docs/healthchecks.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
`nerdctl` supports Docker-compatible health checks for containers, allowing users to monitor container health via a user-defined command.
44

55
## Configuration Options
6+
| :zap: Requirement | nerdctl >= 2.1.5 |
7+
|-------------------|----------------|
68

79
Health checks can be configured in multiple ways:
810

pkg/healthcheck/healthcheck_manager_darwin.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
3737
return nil
3838
}
3939

40-
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
41-
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
40+
// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
41+
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
42+
// on systemd operations. On Darwin, this is a no-op since systemd is not available.
43+
func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error {
4244
return nil
4345
}

pkg/healthcheck/healthcheck_manager_freebsd.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ func RemoveTransientHealthCheckFiles(ctx context.Context, container containerd.C
3737
return nil
3838
}
3939

40-
// RemoveTransientHealthCheckFilesByID stops and cleans up the transient timer and service using just the container ID.
41-
func RemoveTransientHealthCheckFilesByID(ctx context.Context, containerID string) error {
40+
// ForceRemoveTransientHealthCheckFiles forcefully stops and cleans up the transient timer and service
41+
// using just the container ID. This function is non-blocking and uses timeouts to prevent hanging
42+
// on systemd operations. On Darwin, this is a no-op since systemd is not available.
43+
func ForceRemoveTransientHealthCheckFiles(ctx context.Context, containerID string) error {
4244
return nil
4345
}

0 commit comments

Comments
 (0)