Commit 65e5aff

Ensure that file statuses reflect proxy and runtime status (#1380)
With the file-based statuses, we have the potential to go out of sync with the containers and proxy processes. Now, whenever we read a status file that reports "running", we check the workload's status in the runtime and verify that the proxy process is alive. If there is any disagreement, the status in the file is set to "unhealthy". We may want to tweak this behaviour in future, for example by deleting unhealthy workloads or attempting to repair them. This is intended as a first step, since toolhive currently does not do a good job of detecting these problems.
1 parent 77fd543 commit 65e5aff
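
For callers, the visible effect is in GetWorkload: a workload whose status file says "running" can now come back as unhealthy when the runtime state or the proxy process disagrees, with StatusContext describing the mismatch. Below is a minimal caller-side sketch; the reportHealth helper and its function-typed parameter are illustrative only, while the GetWorkload signature, the core.Workload fields, and the status constants come from the code in this commit.

package statusexample

import (
	"context"
	"fmt"

	rt "github.com/stacklok/toolhive/pkg/container/runtime"
	"github.com/stacklok/toolhive/pkg/core"
)

// getWorkloadFunc mirrors the GetWorkload method exposed by the file status manager.
type getWorkloadFunc func(ctx context.Context, workloadName string) (core.Workload, error)

// reportHealth is a hypothetical helper: it reads a workload through the status
// manager and reports whether the new validation marked it unhealthy.
func reportHealth(ctx context.Context, getWorkload getWorkloadFunc, name string) error {
	w, err := getWorkload(ctx, name)
	if err != nil {
		return err
	}

	switch w.Status {
	case rt.WorkloadStatusRunning:
		// File, runtime, and proxy all agree.
		fmt.Printf("%s is running\n", w.Name)
	case rt.WorkloadStatusUnhealthy:
		// Either the runtime state disagreed with the file, or the proxy
		// process was not running; StatusContext says which.
		fmt.Printf("%s is unhealthy: %s\n", w.Name, w.StatusContext)
	default:
		fmt.Printf("%s has status %v\n", w.Name, w.Status)
	}
	return nil
}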

File tree: 2 files changed (+338, -15 lines)


pkg/workloads/statuses/file_status.go

Lines changed: 100 additions & 15 deletions
@@ -14,7 +14,9 @@ import (
 
 	rt "github.com/stacklok/toolhive/pkg/container/runtime"
 	"github.com/stacklok/toolhive/pkg/core"
+	"github.com/stacklok/toolhive/pkg/labels"
 	"github.com/stacklok/toolhive/pkg/logger"
+	"github.com/stacklok/toolhive/pkg/transport/proxy"
 	"github.com/stacklok/toolhive/pkg/workloads/types"
 )
 
@@ -92,22 +94,9 @@ func (f *fileStatusManager) GetWorkload(ctx context.Context, workloadName string
 		return core.Workload{}, err
 	}
 
-	// If file was found and workload is running, get additional info from runtime
+	// If file was found and workload is running, validate against runtime
 	if fileFound && result.Status == rt.WorkloadStatusRunning {
-		// TODO: Find discrepancies between the file and runtime workload.
-		runtimeResult, err := f.getWorkloadFromRuntime(ctx, workloadName)
-		if err != nil {
-			return core.Workload{}, err
-		}
-		// Use runtime data but preserve file-based status info
-		fileStatus := result.Status
-		fileStatusContext := result.StatusContext
-		fileCreatedAt := result.CreatedAt
-		result = runtimeResult
-		result.Status = fileStatus               // Keep the file status
-		result.StatusContext = fileStatusContext // Keep the file status context
-		result.CreatedAt = fileCreatedAt         // Keep the file created time
-		return result, nil
+		return f.validateRunningWorkload(ctx, workloadName, result)
 	}
 
 	// If file was found and workload is not running, return file data
@@ -421,3 +410,99 @@ func (f *fileStatusManager) getWorkloadsFromFiles() (map[string]core.Workload, e
 
 	return workloads, nil
 }
+
+// validateRunningWorkload validates that a workload marked as running in the file
+// is actually running in the runtime and has a healthy proxy process if applicable.
+func (f *fileStatusManager) validateRunningWorkload(
+	ctx context.Context, workloadName string, result core.Workload,
+) (core.Workload, error) {
+	// Get raw container info from runtime (before label filtering)
+	containerInfo, err := f.runtime.GetWorkloadInfo(ctx, workloadName)
+	if err != nil {
+		return core.Workload{}, err
+	}
+
+	// Check if runtime status matches file status
+	if containerInfo.State != rt.WorkloadStatusRunning {
+		return f.handleRuntimeMismatch(ctx, workloadName, result, containerInfo)
+	}
+
+	// Check if proxy process is running when workload is running
+	if unhealthyWorkload, isUnhealthy := f.checkProxyHealth(ctx, workloadName, result, containerInfo); isUnhealthy {
+		return unhealthyWorkload, nil
+	}
+
+	// Runtime and proxy confirm workload is healthy - merge runtime data with file status
+	return f.mergeHealthyWorkloadData(containerInfo, result)
+}
+
+// handleRuntimeMismatch handles the case where file indicates running but runtime shows different status
+func (f *fileStatusManager) handleRuntimeMismatch(
+	ctx context.Context, workloadName string, result core.Workload, containerInfo rt.ContainerInfo,
+) (core.Workload, error) {
+	contextMsg := fmt.Sprintf("workload status mismatch: file indicates running, but runtime shows %s", containerInfo.State)
+	if err := f.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusUnhealthy, contextMsg); err != nil {
+		logger.Warnf("failed to update workload %s status to unhealthy: %v", workloadName, err)
+	}
+
+	// Convert to workload and return unhealthy status
+	runtimeResult, err := types.WorkloadFromContainerInfo(&containerInfo)
+	if err != nil {
+		return core.Workload{}, err
+	}
+
+	runtimeResult.Status = rt.WorkloadStatusUnhealthy
+	runtimeResult.StatusContext = contextMsg
+	runtimeResult.CreatedAt = result.CreatedAt // Keep the original file created time
+	return runtimeResult, nil
+}
+
+// checkProxyHealth checks if the proxy process is running for the workload.
+// Returns (unhealthyWorkload, true) if proxy is not running, (emptyWorkload, false) if proxy is healthy or not applicable.
+func (f *fileStatusManager) checkProxyHealth(
+	ctx context.Context, workloadName string, result core.Workload, containerInfo rt.ContainerInfo,
+) (core.Workload, bool) {
+	// Use original container labels (before filtering) to get base name
+	baseName := labels.GetContainerBaseName(containerInfo.Labels)
+	if baseName == "" {
+		return core.Workload{}, false // No proxy check needed
+	}
+
+	proxyRunning := proxy.IsRunning(baseName)
+	if proxyRunning {
+		return core.Workload{}, false // Proxy is healthy
+	}
+
+	// Proxy is not running, but workload should be running
+	contextMsg := fmt.Sprintf("proxy process not running: workload shows running but proxy process for %s is not active",
		baseName)
+	if err := f.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusUnhealthy, contextMsg); err != nil {
+		logger.Warnf("failed to update workload %s status to unhealthy: %v", workloadName, err)
+	}
+
+	// Convert to workload and return unhealthy status
+	runtimeResult, err := types.WorkloadFromContainerInfo(&containerInfo)
+	if err != nil {
+		logger.Warnf("failed to convert container info for unhealthy workload %s: %v", workloadName, err)
+		return core.Workload{}, false // Return false to avoid double error handling
+	}
+
+	runtimeResult.Status = rt.WorkloadStatusUnhealthy
+	runtimeResult.StatusContext = contextMsg
+	runtimeResult.CreatedAt = result.CreatedAt // Keep the original file created time
+	return runtimeResult, true
+}
+
+// mergeHealthyWorkloadData merges runtime container data with file-based status information
+func (*fileStatusManager) mergeHealthyWorkloadData(containerInfo rt.ContainerInfo, result core.Workload) (core.Workload, error) {
+	// Runtime and proxy confirm workload is healthy - use runtime data but preserve file-based status info
+	runtimeResult, err := types.WorkloadFromContainerInfo(&containerInfo)
+	if err != nil {
+		return core.Workload{}, err
+	}
+
+	runtimeResult.Status = result.Status               // Keep the file status (running)
+	runtimeResult.StatusContext = result.StatusContext // Keep the file status context
+	runtimeResult.CreatedAt = result.CreatedAt         // Keep the file created time
+	return runtimeResult, nil
+}

pkg/workloads/statuses/file_status_test.go

Lines changed: 238 additions & 0 deletions
@@ -622,3 +622,241 @@ func TestFileStatusManager_ListWorkloads(t *testing.T) {
 		})
 	}
 }
+
+func TestFileStatusManager_GetWorkload_UnhealthyDetection(t *testing.T) {
+	t.Parallel()
+
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	tempDir := t.TempDir()
+	mockRuntime := mocks.NewMockRuntime(ctrl)
+	manager := &fileStatusManager{
+		baseDir: tempDir,
+		runtime: mockRuntime,
+	}
+	ctx := context.Background()
+
+	// First, set the workload status to running in the file
+	err := manager.SetWorkloadStatus(ctx, "test-workload", rt.WorkloadStatusRunning, "container started")
+	require.NoError(t, err)
+
+	// Mock the runtime to return a stopped workload (mismatch with file)
+	stoppedInfo := rt.ContainerInfo{
+		Name:    "test-workload",
+		Image:   "test-image:latest",
+		Status:  "Exited (0) 2 minutes ago",
+		State:   rt.WorkloadStatusStopped, // Runtime says stopped
+		Created: time.Now().Add(-10 * time.Minute),
+		Labels: map[string]string{
+			"toolhive":      "true",
+			"toolhive-name": "test-workload",
+		},
+	}
+
+	mockRuntime.EXPECT().
+		GetWorkloadInfo(gomock.Any(), "test-workload").
+		Return(stoppedInfo, nil)
+
+	// Mock the call to SetWorkloadStatus that will be made to update to unhealthy
+	// This is tricky because we need to intercept the call but allow it to proceed
+	// For simplicity, we'll just allow the call to succeed
+	mockRuntime.EXPECT().
+		GetWorkloadInfo(gomock.Any(), "test-workload").
+		Return(stoppedInfo, nil).
+		AnyTimes() // Allow multiple calls during the SetWorkloadStatus operation
+
+	// Get the workload - this should detect the mismatch and return unhealthy status
+	workload, err := manager.GetWorkload(ctx, "test-workload")
+	require.NoError(t, err)
+
+	// Verify the workload is marked as unhealthy
+	assert.Equal(t, "test-workload", workload.Name)
+	assert.Equal(t, rt.WorkloadStatusUnhealthy, workload.Status)
+	assert.Contains(t, workload.StatusContext, "workload status mismatch")
+	assert.Contains(t, workload.StatusContext, "file indicates running")
+	assert.Contains(t, workload.StatusContext, "runtime shows stopped")
+	assert.Equal(t, "test-image:latest", workload.Package)
+
+	// Verify the file was updated to unhealthy status
+	// Get the workload again (this time without runtime mismatch since status is now unhealthy)
+	statusFilePath := filepath.Join(tempDir, "test-workload.json")
+	data, err := os.ReadFile(statusFilePath)
+	require.NoError(t, err)
+
+	var statusFile workloadStatusFile
+	err = json.Unmarshal(data, &statusFile)
+	require.NoError(t, err)
+
+	assert.Equal(t, rt.WorkloadStatusUnhealthy, statusFile.Status)
+	assert.Contains(t, statusFile.StatusContext, "workload status mismatch")
+}
+
+func TestFileStatusManager_GetWorkload_HealthyRunningWorkload(t *testing.T) {
+	t.Parallel()
+
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	tempDir := t.TempDir()
+	mockRuntime := mocks.NewMockRuntime(ctrl)
+	manager := &fileStatusManager{
+		baseDir: tempDir,
+		runtime: mockRuntime,
+	}
+	ctx := context.Background()
+
+	// Set the workload status to running in the file
+	err := manager.SetWorkloadStatus(ctx, "healthy-workload", rt.WorkloadStatusRunning, "container started")
+	require.NoError(t, err)
+
+	// Mock the runtime to return a running workload (matches file)
+	runningInfo := rt.ContainerInfo{
+		Name:    "healthy-workload",
+		Image:   "test-image:latest",
+		Status:  "Up 5 minutes",
+		State:   rt.WorkloadStatusRunning, // Runtime says running (matches file)
+		Created: time.Now().Add(-10 * time.Minute),
+		Labels: map[string]string{
+			"toolhive":      "true",
+			"toolhive-name": "healthy-workload",
+		},
+	}
+
+	mockRuntime.EXPECT().
+		GetWorkloadInfo(gomock.Any(), "healthy-workload").
+		Return(runningInfo, nil)
+
+	// Get the workload - this should remain running since file and runtime match
+	workload, err := manager.GetWorkload(ctx, "healthy-workload")
+	require.NoError(t, err)
+
+	// Verify the workload remains running
+	assert.Equal(t, "healthy-workload", workload.Name)
+	assert.Equal(t, rt.WorkloadStatusRunning, workload.Status)
+	assert.Equal(t, "container started", workload.StatusContext) // Original file context preserved
+	assert.Equal(t, "test-image:latest", workload.Package)
+}
+
+func TestFileStatusManager_GetWorkload_ProxyNotRunning(t *testing.T) {
+	t.Parallel()
+
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	tempDir := t.TempDir()
+	mockRuntime := mocks.NewMockRuntime(ctrl)
+
+	// Create file status manager directly instead of using NewFileStatusManager
+	manager := &fileStatusManager{
+		baseDir: tempDir,
+		runtime: mockRuntime,
+	}
+	ctx := context.Background()
+
+	// First, create a status file manually to ensure file is found
+	statusFile := workloadStatusFile{
+		Status:        rt.WorkloadStatusRunning,
+		StatusContext: "container started",
+		CreatedAt:     time.Now(),
+		UpdatedAt:     time.Now(),
+	}
+	statusFilePath := filepath.Join(tempDir, "proxy-down-workload.json")
+	statusData, err := json.Marshal(statusFile)
+	require.NoError(t, err)
+	err = os.WriteFile(statusFilePath, statusData, 0644)
+	require.NoError(t, err)
+
+	// Mock the runtime to return a running workload with proper labels
+	runningInfo := rt.ContainerInfo{
+		Name:    "proxy-down-workload",
+		Image:   "test-image:latest",
+		Status:  "Up 5 minutes",
+		State:   rt.WorkloadStatusRunning, // Runtime says running (matches file)
+		Created: time.Now().Add(-10 * time.Minute),
+		Labels: map[string]string{
+			"toolhive":          "true",
+			"toolhive-name":     "proxy-down-workload",
+			"toolhive-basename": "proxy-down-workload", // This is the base name for proxy
+		},
+	}
+
+	// Mock the GetWorkloadInfo call that will be made during the proxy check
+	mockRuntime.EXPECT().
+		GetWorkloadInfo(gomock.Any(), "proxy-down-workload").
+		Return(runningInfo, nil).
+		AnyTimes() // Allow multiple calls during the SetWorkloadStatus operation as well
+
+	// Note: proxy.IsRunning will check the actual system, but since there's no proxy
+	// process running for "proxy-down-workload", it will return false
+
+	// Get the workload - this should detect the proxy is not running and return unhealthy
+	workload, err := manager.GetWorkload(ctx, "proxy-down-workload")
+	require.NoError(t, err)
+
+	// Verify the workload is marked as unhealthy due to proxy not running
+	assert.Equal(t, "proxy-down-workload", workload.Name)
+	assert.Equal(t, rt.WorkloadStatusUnhealthy, workload.Status)
+	assert.Contains(t, workload.StatusContext, "proxy process not running")
+	assert.Contains(t, workload.StatusContext, "proxy-down-workload")
+	assert.Contains(t, workload.StatusContext, "not active")
+	assert.Equal(t, "test-image:latest", workload.Package)
+
+	// Verify the file was updated to unhealthy status
+	data, err := os.ReadFile(statusFilePath)
+	require.NoError(t, err)
+
+	var updatedStatusFile workloadStatusFile
+	err = json.Unmarshal(data, &updatedStatusFile)
+	require.NoError(t, err)
+
+	assert.Equal(t, rt.WorkloadStatusUnhealthy, updatedStatusFile.Status)
+	assert.Contains(t, updatedStatusFile.StatusContext, "proxy process not running")
+}
+
+func TestFileStatusManager_GetWorkload_HealthyWithProxy(t *testing.T) {
+	t.Parallel()
+
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	tempDir := t.TempDir()
+	mockRuntime := mocks.NewMockRuntime(ctrl)
+	manager := &fileStatusManager{
+		baseDir: tempDir,
+		runtime: mockRuntime,
+	}
+	ctx := context.Background()
+
+	// Set the workload status to running in the file
+	err := manager.SetWorkloadStatus(ctx, "healthy-with-proxy", rt.WorkloadStatusRunning, "container started")
+	require.NoError(t, err)
+
+	// Mock the runtime to return a running workload without base name (no proxy check)
+	runningInfo := rt.ContainerInfo{
+		Name:    "healthy-with-proxy",
+		Image:   "test-image:latest",
+		Status:  "Up 5 minutes",
+		State:   rt.WorkloadStatusRunning,
+		Created: time.Now().Add(-10 * time.Minute),
+		Labels: map[string]string{
+			"toolhive":      "true",
+			"toolhive-name": "healthy-with-proxy",
+			// No toolhive-base-name label, so proxy check will be skipped
+		},
+	}
+
+	mockRuntime.EXPECT().
+		GetWorkloadInfo(gomock.Any(), "healthy-with-proxy").
+		Return(runningInfo, nil)
+
+	// Get the workload - this should remain running since there's no base name for proxy check
+	workload, err := manager.GetWorkload(ctx, "healthy-with-proxy")
+	require.NoError(t, err)
+
+	// Verify the workload remains running (no proxy check due to missing base name)
+	assert.Equal(t, "healthy-with-proxy", workload.Name)
+	assert.Equal(t, rt.WorkloadStatusRunning, workload.Status)
+	assert.Equal(t, "container started", workload.StatusContext) // Original file context preserved
+	assert.Equal(t, "test-image:latest", workload.Package)
+}
