Skip to content

Commit 529ef6d

Browse files
authored
Migrate PID files to file statuses on status Get/List (#1802)
This code introduces changes into the status logic which attempts to load the PID from a PID file on disk if the PID is missing from the status file.
1 parent 2b4f9aa commit 529ef6d

File tree

2 files changed

+302
-0
lines changed

2 files changed

+302
-0
lines changed

pkg/workloads/statuses/file_status.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/stacklok/toolhive/pkg/labels"
1818
"github.com/stacklok/toolhive/pkg/lockfile"
1919
"github.com/stacklok/toolhive/pkg/logger"
20+
"github.com/stacklok/toolhive/pkg/process"
2021
"github.com/stacklok/toolhive/pkg/state"
2122
"github.com/stacklok/toolhive/pkg/transport/proxy"
2223
"github.com/stacklok/toolhive/pkg/workloads/types"
@@ -134,6 +135,23 @@ func (f *fileStatusManager) GetWorkload(ctx context.Context, workloadName string
134135
result.StatusContext = statusFile.StatusContext
135136
result.CreatedAt = statusFile.CreatedAt
136137
fileFound = true
138+
139+
// Check if PID migration is needed
140+
if statusFile.Status == rt.WorkloadStatusRunning && statusFile.ProcessID == 0 {
141+
// Try PID migration - the migration function will handle cases
142+
// where container info is not available gracefully
143+
if migratedPID, wasMigrated := f.migratePIDFromFile(workloadName, nil); wasMigrated {
144+
// Update the status file with the migrated PID
145+
statusFile.ProcessID = migratedPID
146+
statusFile.UpdatedAt = time.Now()
147+
if err := f.writeStatusFile(statusFilePath, *statusFile); err != nil {
148+
logger.Warnf("failed to write migrated PID for workload %s: %v", workloadName, err)
149+
} else {
150+
logger.Debugf("successfully migrated PID %d to status file for workload %s", migratedPID, workloadName)
151+
}
152+
}
153+
}
154+
137155
return nil
138156
})
139157
if err != nil {
@@ -346,6 +364,44 @@ func (f *fileStatusManager) ResetWorkloadPID(ctx context.Context, workloadName s
346364
return f.SetWorkloadPID(ctx, workloadName, 0)
347365
}
348366

367+
// migratePIDFromFile migrates PID from legacy PID file to status file if needed.
368+
// This is called when the status is running and ProcessID is 0.
369+
// Returns (migratedPID, wasUpdated) where wasUpdated indicates if the PID was successfully migrated
370+
func (*fileStatusManager) migratePIDFromFile(workloadName string, containerInfo *rt.ContainerInfo) (int, bool) {
371+
// Get the base name from container labels
372+
var baseName string
373+
if containerInfo != nil {
374+
baseName = labels.GetContainerBaseName(containerInfo.Labels)
375+
} else {
376+
// If we don't have container info, try using workload name as base name
377+
baseName = workloadName
378+
}
379+
380+
if baseName == "" {
381+
logger.Debugf("no base name available for workload %s, skipping PID migration", workloadName)
382+
return 0, false
383+
}
384+
385+
// Try to read PID from PID file
386+
// The ReadPIDFile function handles checking both old and new locations
387+
pid, err := process.ReadPIDFile(baseName)
388+
if err != nil {
389+
logger.Debugf("failed to read PID file for workload %s (base name: %s): %v", workloadName, baseName, err)
390+
return 0, false
391+
}
392+
393+
logger.Debugf("found PID %d in PID file for workload %s, will update status file", pid, workloadName)
394+
395+
// TODO: reinstate this once we decide to completely get rid of PID files.
396+
// Delete the PID file after successful migration
397+
/*if err := process.RemovePIDFile(baseName); err != nil {
398+
logger.Warnf("failed to remove PID file for workload %s (base name: %s): %v", workloadName, baseName, err)
399+
// Don't return false here - the migration succeeded, cleanup just failed
400+
}*/
401+
402+
return pid, true
403+
}
404+
349405
// getStatusFilePath returns the file path for a given workload's status file.
350406
func (f *fileStatusManager) getStatusFilePath(workloadName string) string {
351407
return filepath.Join(f.baseDir, fmt.Sprintf("%s.json", workloadName))
@@ -557,6 +613,22 @@ func (f *fileStatusManager) getWorkloadsFromFiles() (map[string]core.Workload, e
557613
workload.Remote = true
558614
}
559615

616+
// Check if PID migration is needed
617+
if statusFile.Status == rt.WorkloadStatusRunning && statusFile.ProcessID == 0 {
618+
// Try PID migration - the migration function will handle cases
619+
// where container info is not available gracefully
620+
if migratedPID, wasMigrated := f.migratePIDFromFile(workloadName, nil); wasMigrated {
621+
// Update the status file with the migrated PID
622+
statusFile.ProcessID = migratedPID
623+
statusFile.UpdatedAt = time.Now()
624+
if err := f.writeStatusFile(statusFilePath, *statusFile); err != nil {
625+
logger.Warnf("failed to write migrated PID for workload %s: %v", workloadName, err)
626+
} else {
627+
logger.Debugf("successfully migrated PID %d to status file for workload %s", migratedPID, workloadName)
628+
}
629+
}
630+
}
631+
560632
workloads[workloadName] = workload
561633
return nil
562634
})

pkg/workloads/statuses/file_status_test.go

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
rtmocks "github.com/stacklok/toolhive/pkg/container/runtime/mocks"
2121
"github.com/stacklok/toolhive/pkg/core"
2222
"github.com/stacklok/toolhive/pkg/logger"
23+
"github.com/stacklok/toolhive/pkg/process"
2324
stateMocks "github.com/stacklok/toolhive/pkg/state/mocks"
2425
)
2526

@@ -1682,3 +1683,232 @@ func TestFileStatusManager_ResetWorkloadPID_WithSlashes(t *testing.T) {
16821683
assert.Equal(t, "started", statusFileData.StatusContext)
16831684
assert.Equal(t, 0, statusFileData.ProcessID) // PID should be reset to 0
16841685
}
1686+
1687+
// TestFileStatusManager_GetWorkload_PIDMigration tests PID migration from legacy PID files to status files
1688+
func TestFileStatusManager_GetWorkload_PIDMigration(t *testing.T) {
1689+
t.Parallel()
1690+
1691+
tests := []struct {
1692+
name string
1693+
setupPIDFile bool
1694+
pidValue int
1695+
workloadStatus rt.WorkloadStatus
1696+
processID int
1697+
expectMigration bool
1698+
expectPIDFile bool // whether PID file should exist after operation
1699+
}{
1700+
{
1701+
name: "migrates PID when status is running and ProcessID is 0",
1702+
setupPIDFile: true,
1703+
pidValue: 12345,
1704+
workloadStatus: rt.WorkloadStatusRunning,
1705+
processID: 0,
1706+
expectMigration: true,
1707+
expectPIDFile: true, // PID file is NOT deleted (see TODO comment in migration code)
1708+
},
1709+
{
1710+
name: "no migration when status is not running",
1711+
setupPIDFile: true,
1712+
pidValue: 12345,
1713+
workloadStatus: rt.WorkloadStatusStopped,
1714+
processID: 0,
1715+
expectMigration: false,
1716+
expectPIDFile: true,
1717+
},
1718+
{
1719+
name: "no migration when ProcessID is not 0",
1720+
setupPIDFile: true,
1721+
pidValue: 12345,
1722+
workloadStatus: rt.WorkloadStatusRunning,
1723+
processID: 98765,
1724+
expectMigration: false,
1725+
expectPIDFile: true,
1726+
},
1727+
{
1728+
name: "no migration when no PID file exists",
1729+
setupPIDFile: false,
1730+
pidValue: 0,
1731+
workloadStatus: rt.WorkloadStatusRunning,
1732+
processID: 0,
1733+
expectMigration: false,
1734+
expectPIDFile: false,
1735+
},
1736+
}
1737+
1738+
for _, tt := range tests {
1739+
t.Run(tt.name, func(t *testing.T) {
1740+
t.Parallel()
1741+
1742+
ctrl := gomock.NewController(t)
1743+
defer ctrl.Finish()
1744+
1745+
manager, mockRuntime, mockRunConfigStore := newTestFileStatusManager(t, ctrl)
1746+
ctx := context.Background()
1747+
workloadName := fmt.Sprintf("test-workload-migration-%d", time.Now().UnixNano()) // unique name to avoid locking conflicts
1748+
1749+
// Mock the run config store to return false for exists (not a remote workload)
1750+
mockRunConfigStore.EXPECT().Exists(gomock.Any(), workloadName).Return(false, nil).AnyTimes()
1751+
mockRunConfigStore.EXPECT().GetReader(gomock.Any(), workloadName).Return(nil, errors.New("not found")).AnyTimes()
1752+
1753+
// Mock GetWorkloadInfo for runtime validation (when status is running after migration)
1754+
if tt.workloadStatus == rt.WorkloadStatusRunning {
1755+
// Mock the container info that would be returned during validation
1756+
containerInfo := rt.ContainerInfo{
1757+
Name: workloadName,
1758+
Image: "test-image:latest",
1759+
Status: "running",
1760+
State: rt.WorkloadStatusRunning,
1761+
Labels: make(map[string]string),
1762+
}
1763+
mockRuntime.EXPECT().GetWorkloadInfo(gomock.Any(), workloadName).Return(containerInfo, nil).AnyTimes()
1764+
}
1765+
1766+
// Create status file with specified status and ProcessID
1767+
err := manager.setWorkloadStatusInternal(ctx, workloadName, tt.workloadStatus, "test context", &tt.processID)
1768+
require.NoError(t, err)
1769+
1770+
// Setup PID file if needed
1771+
var pidFilePath string
1772+
if tt.setupPIDFile {
1773+
// Create PID file using the process package
1774+
err = process.WritePIDFile(workloadName, tt.pidValue)
1775+
require.NoError(t, err)
1776+
1777+
// Get the path for cleanup verification
1778+
pidFilePath, err = process.GetPIDFilePathWithFallback(workloadName)
1779+
require.NoError(t, err)
1780+
}
1781+
1782+
// Call GetWorkload which should trigger migration if conditions are met
1783+
workload, err := manager.GetWorkload(ctx, workloadName)
1784+
require.NoError(t, err)
1785+
1786+
// Verify workload properties
1787+
assert.Equal(t, workloadName, workload.Name)
1788+
assert.Equal(t, tt.workloadStatus, workload.Status)
1789+
1790+
if tt.expectMigration {
1791+
// Read the status file to verify PID was migrated
1792+
statusFilePath := manager.getStatusFilePath(workloadName)
1793+
data, err := os.ReadFile(statusFilePath)
1794+
require.NoError(t, err)
1795+
1796+
var statusFile workloadStatusFile
1797+
err = json.Unmarshal(data, &statusFile)
1798+
require.NoError(t, err)
1799+
1800+
assert.Equal(t, tt.pidValue, statusFile.ProcessID, "PID should be migrated to status file")
1801+
} else {
1802+
// Read the status file to verify PID was NOT changed
1803+
statusFilePath := manager.getStatusFilePath(workloadName)
1804+
data, err := os.ReadFile(statusFilePath)
1805+
require.NoError(t, err)
1806+
1807+
var statusFile workloadStatusFile
1808+
err = json.Unmarshal(data, &statusFile)
1809+
require.NoError(t, err)
1810+
1811+
assert.Equal(t, tt.processID, statusFile.ProcessID, "PID should remain unchanged")
1812+
}
1813+
1814+
// Verify PID file existence
1815+
if tt.setupPIDFile {
1816+
_, err := os.Stat(pidFilePath)
1817+
if tt.expectPIDFile {
1818+
assert.NoError(t, err, "PID file should still exist")
1819+
} else {
1820+
assert.True(t, os.IsNotExist(err), "PID file should be deleted")
1821+
}
1822+
}
1823+
1824+
// Cleanup
1825+
if tt.setupPIDFile {
1826+
_ = process.RemovePIDFile(workloadName)
1827+
}
1828+
})
1829+
}
1830+
}
1831+
1832+
// TestFileStatusManager_ListWorkloads_PIDMigration tests PID migration during list operations
1833+
func TestFileStatusManager_ListWorkloads_PIDMigration(t *testing.T) {
1834+
t.Parallel()
1835+
1836+
ctrl := gomock.NewController(t)
1837+
defer ctrl.Finish()
1838+
1839+
manager, mockRuntime, mockRunConfigStore := newTestFileStatusManager(t, ctrl)
1840+
ctx := context.Background()
1841+
1842+
// Mock runtime to return empty list (no running containers)
1843+
mockRuntime.EXPECT().ListWorkloads(gomock.Any()).Return([]rt.ContainerInfo{}, nil)
1844+
1845+
// Create two workloads: one that should migrate, one that shouldn't
1846+
workloadMigrate := fmt.Sprintf("workload-migrate-%d", time.Now().UnixNano())
1847+
workloadNoMigrate := fmt.Sprintf("workload-no-migrate-%d", time.Now().UnixNano())
1848+
1849+
// Mock the run config store for both workloads
1850+
mockRunConfigStore.EXPECT().Exists(gomock.Any(), workloadMigrate).Return(false, nil).AnyTimes()
1851+
mockRunConfigStore.EXPECT().GetReader(gomock.Any(), workloadMigrate).Return(nil, errors.New("not found")).AnyTimes()
1852+
mockRunConfigStore.EXPECT().Exists(gomock.Any(), workloadNoMigrate).Return(false, nil).AnyTimes()
1853+
mockRunConfigStore.EXPECT().GetReader(gomock.Any(), workloadNoMigrate).Return(nil, errors.New("not found")).AnyTimes()
1854+
1855+
// Setup workload that should trigger migration (running + ProcessID = 0)
1856+
err := manager.setWorkloadStatusInternal(ctx, workloadMigrate, rt.WorkloadStatusRunning, "running", &[]int{0}[0])
1857+
require.NoError(t, err)
1858+
1859+
// Setup workload that shouldn't trigger migration (running + ProcessID != 0)
1860+
existingPID := 54321
1861+
err = manager.setWorkloadStatusInternal(ctx, workloadNoMigrate, rt.WorkloadStatusRunning, "running", &existingPID)
1862+
require.NoError(t, err)
1863+
1864+
// Create PID files for both workloads
1865+
migrationPID := 12345
1866+
err = process.WritePIDFile(workloadMigrate, migrationPID)
1867+
require.NoError(t, err)
1868+
defer process.RemovePIDFile(workloadMigrate)
1869+
1870+
err = process.WritePIDFile(workloadNoMigrate, 99999)
1871+
require.NoError(t, err)
1872+
defer process.RemovePIDFile(workloadNoMigrate)
1873+
1874+
// Call ListWorkloads
1875+
workloads, err := manager.ListWorkloads(ctx, true, nil)
1876+
require.NoError(t, err)
1877+
1878+
// Should have 2 workloads
1879+
require.Len(t, workloads, 2)
1880+
1881+
// Find the workloads in results
1882+
var migrateWorkload, noMigrateWorkload *core.Workload
1883+
for i := range workloads {
1884+
switch workloads[i].Name {
1885+
case workloadMigrate:
1886+
migrateWorkload = &workloads[i]
1887+
case workloadNoMigrate:
1888+
noMigrateWorkload = &workloads[i]
1889+
}
1890+
}
1891+
1892+
require.NotNil(t, migrateWorkload, "should find workload that should migrate")
1893+
require.NotNil(t, noMigrateWorkload, "should find workload that should not migrate")
1894+
1895+
// Verify migration occurred for first workload
1896+
statusFilePath1 := manager.getStatusFilePath(workloadMigrate)
1897+
data1, err := os.ReadFile(statusFilePath1)
1898+
require.NoError(t, err)
1899+
1900+
var statusFile1 workloadStatusFile
1901+
err = json.Unmarshal(data1, &statusFile1)
1902+
require.NoError(t, err)
1903+
assert.Equal(t, migrationPID, statusFile1.ProcessID, "PID should be migrated for first workload")
1904+
1905+
// Verify no migration for second workload
1906+
statusFilePath2 := manager.getStatusFilePath(workloadNoMigrate)
1907+
data2, err := os.ReadFile(statusFilePath2)
1908+
require.NoError(t, err)
1909+
1910+
var statusFile2 workloadStatusFile
1911+
err = json.Unmarshal(data2, &statusFile2)
1912+
require.NoError(t, err)
1913+
assert.Equal(t, existingPID, statusFile2.ProcessID, "PID should remain unchanged for second workload")
1914+
}

0 commit comments

Comments
 (0)