Skip to content

Commit d477b72

Browse files
authored
Make lockfile cleanup more resilient (#1722)
* Config lock files were not being cleaned up. * Ensure lock files get cleaned up as part of signal handling. * Ensure stale lock files get cleaned up.
1 parent 28001e4 commit d477b72

File tree

7 files changed

+665
-34
lines changed

7 files changed

+665
-34
lines changed

cmd/thv/main.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,16 @@ package main
33

44
import (
55
"os"
6+
"os/signal"
7+
"syscall"
8+
"time"
9+
10+
"github.com/adrg/xdg"
611

712
"github.com/stacklok/toolhive/cmd/thv/app"
813
"github.com/stacklok/toolhive/pkg/client"
914
"github.com/stacklok/toolhive/pkg/container/runtime"
15+
"github.com/stacklok/toolhive/pkg/lockfile"
1016
"github.com/stacklok/toolhive/pkg/logger"
1117
"github.com/stacklok/toolhive/pkg/migration"
1218
)
@@ -15,6 +21,12 @@ func main() {
1521
// Initialize the logger
1622
logger.Initialize()
1723

24+
// Setup signal handling for graceful cleanup
25+
setupSignalHandler()
26+
27+
// Clean up stale lock files on startup
28+
cleanupStaleLockFiles()
29+
1830
// Check and perform auto-discovery migration if needed
1931
// Handles the auto-discovery flag depreciation, only executes once on old config files
2032
client.CheckAndPerformAutoDiscoveryMigration()
@@ -25,6 +37,48 @@ func main() {
2537

2638
// Skip update check for completion command or if we are running in kubernetes
2739
if err := app.NewRootCmd(!app.IsCompletionCommand(os.Args) && !runtime.IsKubernetesRuntime()).Execute(); err != nil {
40+
// Clean up any remaining lock files on error exit
41+
lockfile.CleanupAllLocks()
2842
os.Exit(1)
2943
}
44+
45+
// Clean up lock files on normal exit
46+
lockfile.CleanupAllLocks()
47+
}
48+
49+
// setupSignalHandler configures signal handling to ensure lock files are cleaned up
50+
func setupSignalHandler() {
51+
sigCh := make(chan os.Signal, 1)
52+
signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
53+
54+
go func() {
55+
<-sigCh
56+
logger.Debugf("Received signal, cleaning up lock files...")
57+
lockfile.CleanupAllLocks()
58+
os.Exit(0)
59+
}()
60+
}
61+
62+
// cleanupStaleLockFiles removes stale lock files from known directories on startup
63+
func cleanupStaleLockFiles() {
64+
// Common directories where lock files are created
65+
var directories []string
66+
67+
// Config directory
68+
if configDir, err := xdg.ConfigFile("toolhive"); err == nil {
69+
directories = append(directories, configDir)
70+
}
71+
72+
// Data directory (for statuses and updates)
73+
if dataDir, err := xdg.DataFile("toolhive"); err == nil {
74+
directories = append(directories, dataDir)
75+
76+
// Specific subdirectories
77+
if statusDir, err := xdg.DataFile("toolhive/statuses"); err == nil {
78+
directories = append(directories, statusDir)
79+
}
80+
}
81+
82+
// Clean up lock files older than 5 minutes (should be safe for most operations)
83+
lockfile.CleanupStaleLocks(directories, 5*time.Minute)
3084
}

pkg/client/config_editor.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ import (
88
"strings"
99
"time"
1010

11-
"github.com/gofrs/flock"
1211
"github.com/tailscale/hujson"
1312
"github.com/tidwall/gjson"
1413
"gopkg.in/yaml.v3"
1514

15+
"github.com/stacklok/toolhive/pkg/lockfile"
1616
"github.com/stacklok/toolhive/pkg/logger"
1717
)
1818

@@ -39,7 +39,8 @@ type JSONConfigUpdater struct {
3939
// Upsert inserts or updates an MCP server in the MCP client config file
4040
func (jcu *JSONConfigUpdater) Upsert(serverName string, data MCPServer) error {
4141
// Create a lock file
42-
fileLock := flock.New(jcu.Path + ".lock")
42+
lockPath := jcu.Path + ".lock"
43+
fileLock := lockfile.NewTrackedLock(lockPath)
4344

4445
// Create a context with timeout
4546
ctx, cancel := context.WithTimeout(context.Background(), lockTimeout)
@@ -53,7 +54,7 @@ func (jcu *JSONConfigUpdater) Upsert(serverName string, data MCPServer) error {
5354
if !locked {
5455
return fmt.Errorf("failed to acquire lock: timeout after %v", lockTimeout)
5556
}
56-
defer fileLock.Unlock()
57+
defer lockfile.ReleaseTrackedLock(lockPath, fileLock)
5758

5859
content, err := os.ReadFile(jcu.Path)
5960
if err != nil {
@@ -98,7 +99,8 @@ func (jcu *JSONConfigUpdater) Upsert(serverName string, data MCPServer) error {
9899
// Remove removes an MCP server from the MCP client config file
99100
func (jcu *JSONConfigUpdater) Remove(serverName string) error {
100101
// Create a lock file
101-
fileLock := flock.New(jcu.Path + ".lock")
102+
lockPath := jcu.Path + ".lock"
103+
fileLock := lockfile.NewTrackedLock(lockPath)
102104

103105
// Create a context with timeout
104106
ctx, cancel := context.WithTimeout(context.Background(), lockTimeout)
@@ -112,7 +114,7 @@ func (jcu *JSONConfigUpdater) Remove(serverName string) error {
112114
if !locked {
113115
return fmt.Errorf("failed to acquire lock: timeout after %v", lockTimeout)
114116
}
115-
defer fileLock.Unlock()
117+
defer lockfile.ReleaseTrackedLock(lockPath, fileLock)
116118

117119
content, err := os.ReadFile(jcu.Path)
118120
if err != nil {
@@ -162,7 +164,8 @@ type YAMLConfigUpdater struct {
162164
// Upsert inserts or updates an MCP server in the config.yaml file using the converter
163165
func (ycu *YAMLConfigUpdater) Upsert(serverName string, data MCPServer) error {
164166
// Create a lock file
165-
fileLock := flock.New(ycu.Path + ".lock")
167+
lockPath := ycu.Path + ".lock"
168+
fileLock := lockfile.NewTrackedLock(lockPath)
166169

167170
// Create a context with timeout
168171
ctx, cancel := context.WithTimeout(context.Background(), lockTimeout)
@@ -176,7 +179,7 @@ func (ycu *YAMLConfigUpdater) Upsert(serverName string, data MCPServer) error {
176179
if !locked {
177180
return fmt.Errorf("failed to acquire lock: timeout after %v", lockTimeout)
178181
}
179-
defer fileLock.Unlock()
182+
defer lockfile.ReleaseTrackedLock(lockPath, fileLock)
180183

181184
content, err := os.ReadFile(ycu.Path)
182185
if err != nil && !os.IsNotExist(err) {
@@ -227,7 +230,8 @@ func (ycu *YAMLConfigUpdater) Upsert(serverName string, data MCPServer) error {
227230
// Remove removes an entry from the config.yaml file using the converter
228231
func (ycu *YAMLConfigUpdater) Remove(serverName string) error {
229232
// Create a lock file
230-
fileLock := flock.New(ycu.Path + ".lock")
233+
lockPath := ycu.Path + ".lock"
234+
fileLock := lockfile.NewTrackedLock(lockPath)
231235

232236
ctx, cancel := context.WithTimeout(context.Background(), lockTimeout)
233237
defer cancel()
@@ -240,7 +244,7 @@ func (ycu *YAMLConfigUpdater) Remove(serverName string) error {
240244
if !locked {
241245
return fmt.Errorf("failed to acquire lock: timeout after %v", lockTimeout)
242246
}
243-
defer fileLock.Unlock()
247+
defer lockfile.ReleaseTrackedLock(lockPath, fileLock)
244248

245249
// Read existing config
246250
content, err := os.ReadFile(ycu.Path)

pkg/config/config.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ import (
1111
"time"
1212

1313
"github.com/adrg/xdg"
14-
"github.com/gofrs/flock"
1514
"gopkg.in/yaml.v3"
1615

1716
"github.com/stacklok/toolhive/pkg/env"
17+
"github.com/stacklok/toolhive/pkg/lockfile"
1818
"github.com/stacklok/toolhive/pkg/logger"
1919
"github.com/stacklok/toolhive/pkg/secrets"
2020
)
@@ -265,7 +265,7 @@ func UpdateConfigAtPath(configPath string, updateFn func(*Config)) error {
265265

266266
// Use a separate lock file for cross-platform compatibility
267267
lockPath := configPath + ".lock"
268-
fileLock := flock.New(lockPath)
268+
fileLock := lockfile.NewTrackedLock(lockPath)
269269
ctx, cancel := context.WithTimeout(context.Background(), lockTimeout)
270270
defer cancel()
271271

@@ -277,7 +277,7 @@ func UpdateConfigAtPath(configPath string, updateFn func(*Config)) error {
277277
if !locked {
278278
return fmt.Errorf("failed to acquire lock: timeout after %v", lockTimeout)
279279
}
280-
defer fileLock.Unlock()
280+
defer lockfile.ReleaseTrackedLock(lockPath, fileLock)
281281

282282
// Load the config after acquiring the lock to avoid race conditions
283283
c, err := LoadOrCreateConfigWithPath(configPath)

pkg/lockfile/cleanup.go

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Package lockfile provides utilities for managing file locks and cleanup.
2+
package lockfile
3+
4+
import (
5+
"os"
6+
"path/filepath"
7+
"sync"
8+
"time"
9+
10+
"github.com/gofrs/flock"
11+
12+
"github.com/stacklok/toolhive/pkg/logger"
13+
)
14+
15+
var (
16+
// globalRegistry holds all active lock files for cleanup
17+
globalRegistry = &lockRegistry{
18+
locks: make(map[string]*flock.Flock),
19+
}
20+
)
21+
22+
// lockRegistry manages active file locks for cleanup purposes
23+
type lockRegistry struct {
24+
mu sync.RWMutex
25+
locks map[string]*flock.Flock
26+
}
27+
28+
// RegisterLock adds a lock to the global registry for cleanup
29+
func (lr *lockRegistry) RegisterLock(lockPath string, lock *flock.Flock) {
30+
lr.mu.Lock()
31+
defer lr.mu.Unlock()
32+
lr.locks[lockPath] = lock
33+
}
34+
35+
// UnregisterLock removes a lock from the global registry
36+
func (lr *lockRegistry) UnregisterLock(lockPath string) {
37+
lr.mu.Lock()
38+
defer lr.mu.Unlock()
39+
delete(lr.locks, lockPath)
40+
}
41+
42+
// CleanupAll unlocks and removes all registered lock files
43+
func (lr *lockRegistry) CleanupAll() {
44+
lr.mu.Lock()
45+
defer lr.mu.Unlock()
46+
47+
for lockPath, lock := range lr.locks {
48+
if err := lock.Unlock(); err != nil && !os.IsNotExist(err) {
49+
logger.Warnf("failed to unlock file %s: %v", lockPath, err)
50+
}
51+
52+
if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) {
53+
logger.Warnf("failed to remove lock file %s: %v", lockPath, err)
54+
}
55+
}
56+
57+
// Clear the registry
58+
lr.locks = make(map[string]*flock.Flock)
59+
}
60+
61+
// NewTrackedLock creates a new file lock and registers it for cleanup
62+
func NewTrackedLock(lockPath string) *flock.Flock {
63+
lock := flock.New(lockPath)
64+
globalRegistry.RegisterLock(lockPath, lock)
65+
return lock
66+
}
67+
68+
// ReleaseTrackedLock unlocks, removes, and unregisters a lock file
69+
func ReleaseTrackedLock(lockPath string, lock *flock.Flock) {
70+
if err := lock.Unlock(); err != nil && !os.IsNotExist(err) {
71+
logger.Warnf("failed to unlock file %s: %v", lockPath, err)
72+
}
73+
74+
if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) {
75+
logger.Warnf("failed to remove lock file %s: %v", lockPath, err)
76+
}
77+
78+
globalRegistry.UnregisterLock(lockPath)
79+
}
80+
81+
// CleanupAllLocks provides global cleanup of all registered lock files
82+
func CleanupAllLocks() {
83+
globalRegistry.CleanupAll()
84+
}
85+
86+
// CleanupStaleLocks removes stale lock files from the specified directories
87+
// A lock file is considered stale if it's older than the maxAge duration
88+
func CleanupStaleLocks(directories []string, maxAge time.Duration) {
89+
cutoff := time.Now().Add(-maxAge)
90+
91+
for _, dir := range directories {
92+
matches, err := filepath.Glob(filepath.Join(dir, "*.lock"))
93+
if err != nil {
94+
logger.Warnf("failed to glob lock files in %s: %v", dir, err)
95+
continue
96+
}
97+
98+
for _, lockFile := range matches {
99+
info, err := os.Stat(lockFile)
100+
if err != nil {
101+
continue // File may have been removed already
102+
}
103+
104+
if info.ModTime().Before(cutoff) {
105+
// Try to acquire the lock to check if it's really stale
106+
lock := flock.New(lockFile)
107+
if locked, err := lock.TryLock(); err == nil && locked {
108+
// Lock was acquired, so it was stale
109+
if err := lock.Unlock(); err != nil && !os.IsNotExist(err) {
110+
logger.Warnf("failed to unlock stale lock file %s: %v", lockFile, err)
111+
}
112+
if err := os.Remove(lockFile); err != nil && !os.IsNotExist(err) {
113+
logger.Warnf("failed to remove stale lock file %s: %v", lockFile, err)
114+
} else {
115+
logger.Debugf("removed stale lock file: %s", lockFile)
116+
}
117+
}
118+
}
119+
}
120+
}
121+
}

0 commit comments

Comments
 (0)