Skip to content

Commit e39040c

Browse files
authored
Ensure that failed starts/restarts are cleaned up properly (#1421)
There were a number of places where a workload start could fail, and not set the status of the workload to error. This identifies and fixes those issues. This change increased the cyclomatic complexity of the restart method beyond our limit, so I refactored the function to simplify it.
1 parent 0d88db7 commit e39040c

File tree

1 file changed

+118
-63
lines changed

1 file changed

+118
-63
lines changed

pkg/workloads/manager.go

Lines changed: 118 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,10 @@ func (d *defaultManager) RunWorkloadDetached(ctx context.Context, runConfig *run
275275

276276
// Start the detached process
277277
if err := detachedCmd.Start(); err != nil {
278+
// If the start failed, we need to set the status to error before returning.
279+
if err := d.statuses.SetWorkloadStatus(ctx, runConfig.BaseName, rt.WorkloadStatusError, ""); err != nil {
280+
logger.Warnf("Failed to set workload %s status to error: %v", runConfig.BaseName, err)
281+
}
278282
return fmt.Errorf("failed to start detached process: %v", err)
279283
}
280284

@@ -444,79 +448,130 @@ func (d *defaultManager) RestartWorkloads(ctx context.Context, names []string, f
444448

445449
for _, name := range names {
446450
group.Go(func() error {
447-
// Create a child context with a longer timeout
448-
childCtx, cancel := context.WithTimeout(context.Background(), AsyncOperationTimeout)
449-
defer cancel()
451+
return d.restartSingleWorkload(ctx, name, foreground)
452+
})
453+
}
450454

451-
// NOTE: Once we have the status manager implemented, we can use it
452-
// to ensure that the workload exists and is in the `stopped` state
453-
// before restarting.
454-
var containerBaseName string
455-
var running bool
456-
// Try to find the container.
457-
container, err := d.runtime.GetWorkloadInfo(childCtx, name)
458-
if err != nil {
459-
if errors.Is(err, rt.ErrWorkloadNotFound) {
460-
logger.Warnf("Warning: Failed to find container: %v", err)
461-
logger.Warnf("Trying to find state with name %s directly...", name)
462-
463-
// Try to use the provided name as the base name
464-
containerBaseName = name
465-
running = false
466-
} else {
467-
return fmt.Errorf("failed to find workload %s: %v", name, err)
468-
}
469-
} else {
470-
// Container found, check if it's running and get the base name,
471-
running = container.IsRunning()
472-
containerBaseName = labels.GetContainerBaseName(container.Labels)
473-
}
455+
return group, nil
456+
}
474457

475-
// Check if the proxy process is running
476-
proxyRunning := proxy.IsRunning(containerBaseName)
458+
// restartSingleWorkload handles the restart logic for a single workload
459+
func (d *defaultManager) restartSingleWorkload(ctx context.Context, name string, foreground bool) error {
460+
// Create a child context with a longer timeout
461+
childCtx, cancel := context.WithTimeout(context.Background(), AsyncOperationTimeout)
462+
defer cancel()
477463

478-
if running && proxyRunning {
479-
logger.Infof("Container %s and proxy are already running", name)
480-
return nil
481-
}
464+
// Get workload state information
465+
workloadState, err := d.getWorkloadState(childCtx, name)
466+
if err != nil {
467+
return err
468+
}
482469

483-
// Load the configuration from the state store
484-
// This is done synchronously since it is relatively inexpensive operation
485-
// and it allows for better error handling.
486-
mcpRunner, err := d.loadRunnerFromState(childCtx, containerBaseName)
487-
if err != nil {
488-
// TODO: If the state file has gone missing, we should delete
489-
// the workload - since there is no chance of recovery.
490-
return fmt.Errorf("failed to load state for %s: %v", containerBaseName, err)
491-
}
470+
// Check if already running
471+
if d.isWorkloadAlreadyRunning(name, workloadState) {
472+
return nil
473+
}
492474

493-
// At this point we're sure that the workload exists but is not running.
494-
// Transition workload to `starting` state.
495-
if err := d.statuses.SetWorkloadStatus(ctx, name, rt.WorkloadStatusStarting, ""); err != nil {
496-
logger.Warnf("Failed to set workload %s status to starting: %v", name, err)
497-
}
498-
logger.Infof("Loaded configuration from state for %s", containerBaseName)
475+
// Load runner configuration from state
476+
mcpRunner, err := d.loadRunnerFromState(childCtx, workloadState.BaseName)
477+
if err != nil {
478+
return fmt.Errorf("failed to load state for %s: %v", workloadState.BaseName, err)
479+
}
499480

500-
// Run the tooling server inside a detached process.
501-
logger.Infof("Starting tooling server %s...", name)
481+
// Set workload status to starting
482+
if err := d.statuses.SetWorkloadStatus(ctx, name, rt.WorkloadStatusStarting, ""); err != nil {
483+
logger.Warnf("Failed to set workload %s status to starting: %v", name, err)
484+
}
485+
logger.Infof("Loaded configuration from state for %s", workloadState.BaseName)
502486

503-
// If the container is running but the proxy is not, stop the container first
504-
if running { // && !proxyRunning was previously here but is implied by previous if statement.
505-
logger.Infof("Container %s is running but proxy is not. Stopping container...", name)
506-
if err = d.runtime.StopWorkload(childCtx, name); err != nil {
507-
return fmt.Errorf("failed to stop container %s: %v", name, err)
508-
}
509-
logger.Infof("Container %s stopped", name)
510-
}
487+
// Stop container if running but proxy is not
488+
if err := d.stopContainerIfNeeded(childCtx, ctx, name, workloadState); err != nil {
489+
return err
490+
}
511491

512-
if foreground {
513-
return d.RunWorkload(ctx, mcpRunner.Config)
514-
}
515-
return d.RunWorkloadDetached(ctx, mcpRunner.Config)
516-
})
492+
// Start the workload
493+
return d.startWorkload(ctx, name, mcpRunner, foreground)
494+
}
495+
496+
// workloadState holds the current state of a workload for restart operations
497+
type workloadState struct {
498+
BaseName string
499+
Running bool
500+
ProxyRunning bool
501+
}
502+
503+
// getWorkloadState retrieves the current state of a workload
504+
func (d *defaultManager) getWorkloadState(ctx context.Context, name string) (*workloadState, error) {
505+
workloadSt := &workloadState{}
506+
507+
// Try to find the container
508+
container, err := d.runtime.GetWorkloadInfo(ctx, name)
509+
if err != nil {
510+
if errors.Is(err, rt.ErrWorkloadNotFound) {
511+
logger.Warnf("Warning: Failed to find container: %v", err)
512+
logger.Warnf("Trying to find state with name %s directly...", name)
513+
// Try to use the provided name as the base name
514+
workloadSt.BaseName = name
515+
workloadSt.Running = false
516+
} else {
517+
return nil, fmt.Errorf("failed to find workload %s: %v", name, err)
518+
}
519+
} else {
520+
// Container found, check if it's running and get the base name
521+
workloadSt.Running = container.IsRunning()
522+
workloadSt.BaseName = labels.GetContainerBaseName(container.Labels)
517523
}
518524

519-
return group, nil
525+
// Check if the proxy process is running
526+
workloadSt.ProxyRunning = proxy.IsRunning(workloadSt.BaseName)
527+
528+
return workloadSt, nil
529+
}
530+
531+
// isWorkloadAlreadyRunning checks if the workload is already fully running
532+
func (*defaultManager) isWorkloadAlreadyRunning(name string, workloadSt *workloadState) bool {
533+
if workloadSt.Running && workloadSt.ProxyRunning {
534+
logger.Infof("Container %s and proxy are already running", name)
535+
return true
536+
}
537+
return false
538+
}
539+
540+
// stopContainerIfNeeded stops the container if it's running but proxy is not
541+
func (d *defaultManager) stopContainerIfNeeded(childCtx, ctx context.Context, name string, workloadSt *workloadState) error {
542+
if !workloadSt.Running {
543+
return nil
544+
}
545+
546+
logger.Infof("Container %s is running but proxy is not. Stopping container...", name)
547+
if err := d.runtime.StopWorkload(childCtx, name); err != nil {
548+
if statusErr := d.statuses.SetWorkloadStatus(ctx, name, rt.WorkloadStatusError, ""); statusErr != nil {
549+
logger.Warnf("Failed to set workload %s status to error: %v", name, statusErr)
550+
}
551+
return fmt.Errorf("failed to stop container %s: %v", name, err)
552+
}
553+
logger.Infof("Container %s stopped", name)
554+
return nil
555+
}
556+
557+
// startWorkload starts the workload in either foreground or background mode
558+
func (d *defaultManager) startWorkload(ctx context.Context, name string, mcpRunner *runner.Runner, foreground bool) error {
559+
logger.Infof("Starting tooling server %s...", name)
560+
561+
var err error
562+
if foreground {
563+
err = d.RunWorkload(ctx, mcpRunner.Config)
564+
} else {
565+
err = d.RunWorkloadDetached(ctx, mcpRunner.Config)
566+
}
567+
568+
if err != nil {
569+
// If we could not start the workload, set the status to error before returning
570+
if statusErr := d.statuses.SetWorkloadStatus(ctx, name, rt.WorkloadStatusError, ""); statusErr != nil {
571+
logger.Warnf("Failed to set workload %s status to error: %v", name, statusErr)
572+
}
573+
}
574+
return err
520575
}
521576

522577
// TODO: Move to dedicated config management interface.

0 commit comments

Comments
 (0)