Skip to content

Commit c5f711b

Browse files
committed
Update healthcheck orchestration logic
Signed-off-by: Arjun Raja Yogidas <[email protected]>
1 parent 3cdfccb commit c5f711b

File tree

13 files changed

+161
-66
lines changed

13 files changed

+161
-66
lines changed

cmd/nerdctl/container/container_create.go

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -279,10 +279,6 @@ func createOptions(cmd *cobra.Command) (types.ContainerCreateOptions, error) {
279279
if err != nil {
280280
return opt, err
281281
}
282-
opt.HealthStartInterval, err = cmd.Flags().GetDuration("health-start-interval")
283-
if err != nil {
284-
return opt, err
285-
}
286282
opt.NoHealthcheck, err = cmd.Flags().GetBool("no-healthcheck")
287283
if err != nil {
288284
return opt, err

cmd/nerdctl/container/container_run.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,11 @@ package container
1919
import (
2020
"errors"
2121
"fmt"
22-
"github.com/containerd/nerdctl/v2/pkg/healthcheck"
2322
"runtime"
2423
"strings"
2524

25+
"github.com/containerd/nerdctl/v2/pkg/healthcheck"
26+
2627
"github.com/spf13/cobra"
2728
"golang.org/x/term"
2829

@@ -241,7 +242,6 @@ func setCreateFlags(cmd *cobra.Command) {
241242
cmd.Flags().Duration("health-timeout", 0, "Maximum time to allow one check to run (default: 30s)")
242243
cmd.Flags().Int("health-retries", 0, "Consecutive failures needed to report unhealthy (default: 3)")
243244
cmd.Flags().Duration("health-start-period", 0, "Start period for the container to initialize before starting health-retries countdown")
244-
cmd.Flags().Duration("health-start-interval", 0, "Time between running the checks during the start period")
245245
cmd.Flags().Bool("no-healthcheck", false, "Disable any container-specified HEALTHCHECK")
246246

247247
// #region env flags

cmd/nerdctl/helpers/flagutil.go

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -52,8 +52,7 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error {
5252
options.HealthInterval != 0 ||
5353
options.HealthTimeout != 0 ||
5454
options.HealthRetries != 0 ||
55-
options.HealthStartPeriod != 0 ||
56-
options.HealthStartInterval != 0
55+
options.HealthStartPeriod != 0
5756

5857
if options.NoHealthcheck {
5958
if options.HealthCmd != "" || healthFlagsSet {
@@ -74,9 +73,6 @@ func ValidateHealthcheckFlags(options types.ContainerCreateOptions) error {
7473
if options.HealthStartPeriod < 0 {
7574
return fmt.Errorf("--health-start-period cannot be negative")
7675
}
77-
if options.HealthStartInterval < 0 {
78-
return fmt.Errorf("--health-start-interval cannot be negative")
79-
}
8076
return nil
8177
}
8278

pkg/api/types/container_types.go

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -285,13 +285,12 @@ type ContainerCreateOptions struct {
285285
ImagePullOpt ImagePullOptions
286286

287287
// Healthcheck related fields
288-
HealthCmd string
289-
HealthInterval time.Duration
290-
HealthTimeout time.Duration
291-
HealthRetries int
292-
HealthStartPeriod time.Duration
293-
HealthStartInterval time.Duration
294-
NoHealthcheck bool
288+
HealthCmd string
289+
HealthInterval time.Duration
290+
HealthTimeout time.Duration
291+
HealthRetries int
292+
HealthStartPeriod time.Duration
293+
NoHealthcheck bool
295294

296295
// UserNS name for user namespace mapping of container
297296
UserNS string

pkg/cmd/container/create.go

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -891,9 +891,6 @@ func withHealthcheck(options types.ContainerCreateOptions, ensuredImage *imgutil
891891
if options.HealthStartPeriod != 0 {
892892
hc.StartPeriod = options.HealthStartPeriod
893893
}
894-
if options.HealthStartInterval != 0 {
895-
hc.StartInterval = options.HealthStartInterval
896-
}
897894

898895
// If no healthcheck config is set (via CLI or image), return empty string so we skip adding to container config.
899896
if reflect.DeepEqual(hc, &healthcheck.Healthcheck{}) {

pkg/cmd/container/health_check.go

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,6 @@ func HealthCheck(ctx context.Context, client *containerd.Client, container conta
5959
hcConfig.Interval = timeoutWithDefault(hcConfig.Interval, healthcheck.DefaultProbeInterval)
6060
hcConfig.Timeout = timeoutWithDefault(hcConfig.Timeout, healthcheck.DefaultProbeTimeout)
6161
hcConfig.StartPeriod = timeoutWithDefault(hcConfig.StartPeriod, healthcheck.DefaultStartPeriod)
62-
hcConfig.StartInterval = timeoutWithDefault(hcConfig.StartInterval, healthcheck.DefaultStartInterval)
6362
if hcConfig.Retries == 0 {
6463
hcConfig.Retries = healthcheck.DefaultProbeRetries
6564
}

pkg/cmd/container/kill.go

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ import (
3535
"github.com/containerd/nerdctl/v2/pkg/api/types"
3636
"github.com/containerd/nerdctl/v2/pkg/clientutil"
3737
"github.com/containerd/nerdctl/v2/pkg/containerutil"
38+
"github.com/containerd/nerdctl/v2/pkg/healthcheck"
3839
"github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker"
3940
"github.com/containerd/nerdctl/v2/pkg/labels"
4041
"github.com/containerd/nerdctl/v2/pkg/netutil"
@@ -112,9 +113,9 @@ func killContainer(ctx context.Context, container containerd.Container, signal s
112113
}
113114

114115
// Clean up healthcheck systemd units
115-
// if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil {
116-
// log.G(ctx).Warnf("failed to clean up healthcheck units for container %s: %s", container.ID(), err)
117-
// }
116+
if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, container); err != nil {
117+
log.G(ctx).Warnf("failed to clean up healthcheck units for container %s: %s", container.ID(), err)
118+
}
118119

119120
// signal will be sent once resume is finished
120121
if paused {

pkg/cmd/container/remove.go

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ import (
3434
"github.com/containerd/nerdctl/v2/pkg/clientutil"
3535
"github.com/containerd/nerdctl/v2/pkg/containerutil"
3636
"github.com/containerd/nerdctl/v2/pkg/dnsutil/hostsstore"
37+
"github.com/containerd/nerdctl/v2/pkg/healthcheck"
3738
"github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker"
3839
"github.com/containerd/nerdctl/v2/pkg/ipcutil"
3940
"github.com/containerd/nerdctl/v2/pkg/labels"
@@ -180,9 +181,9 @@ func RemoveContainer(ctx context.Context, c containerd.Container, globalOptions
180181
retErr = nil
181182

182183
// Clean up healthcheck systemd units
183-
// if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, c); err != nil {
184-
// log.G(ctx).WithError(err).Warnf("failed to clean up healthcheck units for container %q", id)
185-
// }
184+
if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, c); err != nil {
185+
log.G(ctx).WithError(err).Warnf("failed to clean up healthcheck units for container %q", id)
186+
}
186187

187188
// Now, delete the actual container
188189
var delOpts []containerd.DeleteOpts

pkg/cmd/container/stop.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ import (
2525

2626
"github.com/containerd/nerdctl/v2/pkg/api/types"
2727
"github.com/containerd/nerdctl/v2/pkg/containerutil"
28+
"github.com/containerd/nerdctl/v2/pkg/healthcheck"
2829
"github.com/containerd/nerdctl/v2/pkg/idutil/containerwalker"
2930
)
3031

@@ -39,6 +40,9 @@ func Stop(ctx context.Context, client *containerd.Client, reqs []string, opt typ
3940
if err := cleanupNetwork(ctx, found.Container, opt.GOptions); err != nil {
4041
return fmt.Errorf("unable to cleanup network for container: %s", found.Req)
4142
}
43+
if err := healthcheck.RemoveTransientHealthCheckFiles(ctx, found.Container); err != nil {
44+
return fmt.Errorf("unable to cleanup healthcheck timer for container: %s: %w", found.Req, err)
45+
}
4246
if err := containerutil.Stop(ctx, found.Container, opt.Timeout, opt.Signal); err != nil {
4347
if errdefs.IsNotFound(err) {
4448
fmt.Fprintf(opt.Stderr, "No such container: %s\n", found.Req)

pkg/healthcheck/executor.go

Lines changed: 28 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -128,26 +128,42 @@ func updateHealthStatus(ctx context.Context, container containerd.Container, hcC
128128
currentHealth = &HealthState{
129129
Status: Starting,
130130
FailingStreak: 0,
131+
StartPeriod: hcConfig.StartPeriod > 0,
131132
}
132133
}
133134

134-
// Check if still within start period
135-
startPeriod := hcConfig.StartPeriod
135+
// Get container info for start period check
136136
info, err := container.Info(ctx)
137137
if err != nil {
138138
return fmt.Errorf("failed to get container info: %w", err)
139139
}
140140
containerCreated := info.CreatedAt
141-
stillInStartPeriod := hcResult.Start.Sub(containerCreated) < startPeriod
142-
143-
// Update health status based on exit code
144-
if hcResult.ExitCode == 0 {
145-
currentHealth.Status = Healthy
146-
currentHealth.FailingStreak = 0
147-
} else if !stillInStartPeriod {
148-
currentHealth.FailingStreak++
149-
if currentHealth.FailingStreak >= hcConfig.Retries {
150-
currentHealth.Status = Unhealthy
141+
142+
// Check if we're in start period workflow
143+
inStartPeriodTime := hcResult.Start.Sub(containerCreated) < hcConfig.StartPeriod
144+
inStartPeriodState := currentHealth.StartPeriod
145+
146+
if inStartPeriodTime && inStartPeriodState {
147+
// Start Period Workflow
148+
if hcResult.ExitCode == 0 {
149+
// First healthy result transitions us out of start period
150+
currentHealth.Status = Healthy
151+
currentHealth.FailingStreak = 0
152+
currentHealth.StartPeriod = false
153+
}
154+
// Ignore unhealthy results during start period
155+
} else {
156+
// Health Interval Workflow
157+
if hcResult.ExitCode == 0 {
158+
if currentHealth.Status != Healthy {
159+
currentHealth.Status = Healthy
160+
currentHealth.FailingStreak = 0
161+
}
162+
} else {
163+
currentHealth.FailingStreak++
164+
if currentHealth.FailingStreak >= hcConfig.Retries && currentHealth.Status != Unhealthy {
165+
currentHealth.Status = Unhealthy
166+
}
151167
}
152168
}
153169

0 commit comments

Comments
 (0)