Skip to content

Commit 88d35e4

Browse files
h9jianggopherbot
authored andcommitted
gopls/internal/filewatcher: retry watch registration upon failure
Watcher.watchDir is a blocking method that will try watch registration multiple times with exponential back off until succeed or upon channel closure. Exported method Watcher.WatchDir still call watchDir in a blocking manner to make sure the directory is being watched after the WatchDir returns. Internally, Watcher.run() call watchDir in a non-blocking manner, have higher error tolerance. Errors after all retries failed will be surfaced to the client through the handler call back func. To ensure the promise that handler func will be called sequentially, the error is being handled through an internel error channel, processed by the run() methods. Additional refactoring: - Rename sendEvents to drainEvents so the handler func will be called only inside of run() function body. More clear to see the handler will be called sequentially. - Rename field knownDir to dirCancel, the dirCancel maps from a path to a closing channel that watch registration go routine is listening to. For golang/go#74292 Change-Id: I286531f215b52495fbe2dc11b028c79a37cf3d5f Reviewed-on: https://go-review.googlesource.com/c/tools/+/688215 LUCI-TryBot-Result: Go LUCI <[email protected]> Auto-Submit: Hongxiang Jiang <[email protected]> Reviewed-by: Alan Donovan <[email protected]>
1 parent 89ab5e4 commit 88d35e4

File tree

3 files changed

+285
-99
lines changed

3 files changed

+285
-99
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package filewatcher
6+
7+
// This file defines things (and opens backdoors) needed only by tests.
8+
9+
// SetAfterAddHook sets a hook to be called after a path is added to the watcher.
10+
// This is used in tests to inspect the error returned by the underlying watcher.
11+
func SetAfterAddHook(f func(string, error)) {
12+
afterAddHook = f
13+
}

gopls/internal/filewatcher/filewatcher.go

Lines changed: 156 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package filewatcher
66

77
import (
8+
"errors"
89
"io/fs"
910
"log/slog"
1011
"os"
@@ -17,22 +18,33 @@ import (
1718
"golang.org/x/tools/gopls/internal/protocol"
1819
)
1920

21+
// ErrClosed is used when trying to operate on a closed Watcher.
22+
var ErrClosed = errors.New("file watcher: watcher already closed")
23+
2024
// Watcher collects events from a [fsnotify.Watcher] and converts them into
2125
// batched LSP [protocol.FileEvent]s.
2226
type Watcher struct {
2327
logger *slog.Logger
2428

2529
stop chan struct{} // closed by Close to terminate run loop
2630

27-
wg sync.WaitGroup // counts number of active run goroutines (max 1)
31+
// errs is an internal channel for surfacing errors from the file watcher,
32+
// distinct from the fsnotify watcher's error channel.
33+
errs chan error
34+
35+
// watchers counts the number of active watch registration goroutines,
36+
// including their error handling.
37+
watchers sync.WaitGroup
38+
runners sync.WaitGroup // counts the number of active run goroutines (max 1)
2839

2940
watcher *fsnotify.Watcher
3041

3142
mu sync.Mutex // guards all fields below
3243

33-
// knownDirs tracks all known directories to help distinguish between file
34-
// and directory deletion events.
35-
knownDirs map[string]bool
44+
// dirCancel maps a directory path to its cancellation channel.
45+
// A nil map indicates the watcher is closing and prevents new directory
46+
// watch registrations.
47+
dirCancel map[string]chan struct{}
3648

3749
// events is the current batch of unsent file events, which will be sent
3850
// when the timer expires.
@@ -53,11 +65,12 @@ func New(delay time.Duration, logger *slog.Logger, handler func([]protocol.FileE
5365
w := &Watcher{
5466
logger: logger,
5567
watcher: watcher,
56-
knownDirs: make(map[string]bool),
68+
dirCancel: make(map[string]chan struct{}),
69+
errs: make(chan error),
5770
stop: make(chan struct{}),
5871
}
5972

60-
w.wg.Add(1)
73+
w.runners.Add(1)
6174
go w.run(delay, handler)
6275

6376
return w, nil
@@ -66,7 +79,7 @@ func New(delay time.Duration, logger *slog.Logger, handler func([]protocol.FileE
6679
// run is the main event-handling loop for the watcher. It should be run in a
6780
// separate goroutine.
6881
func (w *Watcher) run(delay time.Duration, handler func([]protocol.FileEvent, error)) {
69-
defer w.wg.Done()
82+
defer w.runners.Done()
7083

7184
// timer is used to debounce events.
7285
timer := time.NewTimer(delay)
@@ -78,13 +91,23 @@ func (w *Watcher) run(delay time.Duration, handler func([]protocol.FileEvent, er
7891
return
7992

8093
case <-timer.C:
81-
w.sendEvents(handler)
94+
if events := w.drainEvents(); len(events) > 0 {
95+
handler(events, nil)
96+
}
8297
timer.Reset(delay)
8398

8499
case err, ok := <-w.watcher.Errors:
85100
// When the watcher is closed, its Errors channel is closed, which
86101
// unblocks this case. We continue to the next loop iteration,
87-
// allowing the <-w.closed case to handle the shutdown.
102+
// allowing the <-w.stop case to handle the shutdown.
103+
if !ok {
104+
continue
105+
}
106+
if err != nil {
107+
handler(nil, err)
108+
}
109+
110+
case err, ok := <-w.errs:
88111
if !ok {
89112
continue
90113
}
@@ -96,10 +119,9 @@ func (w *Watcher) run(delay time.Duration, handler func([]protocol.FileEvent, er
96119
if !ok {
97120
continue
98121
}
99-
// file watcher should not handle the fsnotify.Event concurrently,
100-
// the original order should be preserved. E.g. if a file get
101-
// deleted and recreated, running concurrently may result it in
102-
// reverse order.
122+
// fsnotify.Event should not be handled concurrently, to preserve their
123+
// original order. For example, if a file is deleted and recreated,
124+
// concurrent handling could process the events in reverse order.
103125
//
104126
// Only reset the timer if a relevant event happened.
105127
// https://github.com/fsnotify/fsnotify?tab=readme-ov-file#why-do-i-get-many-chmod-events
@@ -134,10 +156,17 @@ func (w *Watcher) WatchDir(path string) error {
134156
if skipDir(d.Name()) {
135157
return filepath.SkipDir
136158
}
137-
w.addKnownDir(path)
138-
if err := w.watchDir(path); err != nil {
139-
// TODO(hxjiang): retry on watch failures.
140-
return filepath.SkipDir
159+
160+
done := w.addWatchHandle(path)
161+
if done == nil { // file watcher closing
162+
return filepath.SkipAll
163+
}
164+
165+
errChan := make(chan error, 1)
166+
w.watchDir(path, done, errChan)
167+
168+
if err := <-errChan; err != nil {
169+
return err
141170
}
142171
}
143172
return nil
@@ -159,8 +188,8 @@ func (w *Watcher) handleEvent(event fsnotify.Event) *protocol.FileEvent {
159188
} else if os.IsNotExist(err) {
160189
// Upon deletion, the file/dir has been removed. fsnotify
161190
// does not provide information regarding the deleted item.
162-
// Use the watchedDirs to determine whether it's a dir.
163-
isDir = w.isKnownDir(path)
191+
// Use watchHandles to determine if the deleted item was a directory.
192+
isDir = w.isWatchedDir(path)
164193
} else {
165194
// If statting failed, something is wrong with the file system.
166195
// Log and move on.
@@ -184,26 +213,25 @@ func (w *Watcher) handleEvent(event fsnotify.Event) *protocol.FileEvent {
184213
fallthrough
185214
case event.Op.Has(fsnotify.Remove):
186215
// Upon removal, we only need to remove the entries from the map.
187-
// The [fsnotify.Watcher] remove the watch for us.
216+
// The [fsnotify.Watcher] removes the watch for us.
188217
// fsnotify/fsnotify#268
189-
w.removeKnownDir(path)
218+
w.removeWatchHandle(path)
190219

191220
// TODO(hxjiang): Directory removal events from some LSP clients may
192221
// not include corresponding removal events for child files and
193-
// subdirectories. Should we do some filtering when add the dir
222+
// subdirectories. Should we do some filtering when adding the dir
194223
// deletion event to the events slice.
195224
return &protocol.FileEvent{
196225
URI: protocol.URIFromPath(path),
197226
Type: protocol.Deleted,
198227
}
199228
case event.Op.Has(fsnotify.Create):
200-
w.addKnownDir(path)
201-
202-
// This watch is added asynchronously to prevent a potential deadlock
203-
// on Windows. The fsnotify library can block when registering a watch
204-
// if its event channel is full (see fsnotify/fsnotify#502).
205-
// TODO(hxjiang): retry on watch failure.
206-
go w.watchDir(path)
229+
// This watch is added asynchronously to prevent a potential
230+
// deadlock on Windows. See fsnotify/fsnotify#502.
231+
// Error encountered will be sent to internal error channel.
232+
if done := w.addWatchHandle(path); done != nil {
233+
go w.watchDir(path, done, w.errs)
234+
}
207235

208236
return &protocol.FileEvent{
209237
URI: protocol.URIFromPath(path),
@@ -244,10 +272,19 @@ func (w *Watcher) handleEvent(event fsnotify.Event) *protocol.FileEvent {
244272
}
245273
}
246274

247-
// watchDir register the watch for the input dir. This function may be blocking
248-
// because of the issue fsnotify/fsnotify#502.
249-
func (w *Watcher) watchDir(path string) error {
250-
// Dir with broken symbolic link can not be watched.
275+
// watchDir registers a watch for a directory, retrying with backoff if it fails.
276+
// It can be canceled by calling removeWatchHandle. On success or cancellation,
277+
// nil is sent to 'errChan'; otherwise, the last error after all retries is sent.
278+
func (w *Watcher) watchDir(path string, done chan struct{}, errChan chan error) {
279+
if errChan == nil {
280+
panic("input error chan is nil")
281+
}
282+
283+
// On darwin, watching a directory will fail if it contains broken symbolic
284+
// links. This state can occur temporarily during operations like a git
285+
// branch switch. To handle this, we retry multiple times with exponential
286+
// backoff, allowing time for the symbolic link's target to be created.
287+
251288
// TODO(hxjiang): Address a race condition where file or directory creations
252289
// under current directory might be missed between the current directory
253290
// creation and the establishment of the file watch.
@@ -256,26 +293,89 @@ func (w *Watcher) watchDir(path string) error {
256293
// 1. Retrospectively check for and trigger creation events for any new
257294
// files/directories.
258295
// 2. Recursively add watches for any newly created subdirectories.
259-
return w.watcher.Add(path)
296+
var (
297+
delay = 500 * time.Millisecond
298+
err error
299+
)
300+
301+
// Watchers wait group becomes done only after errChan send.
302+
w.watchers.Add(1)
303+
defer func() {
304+
errChan <- err
305+
w.watchers.Done()
306+
}()
307+
308+
for i := range 5 {
309+
if i > 0 {
310+
select {
311+
case <-time.After(delay):
312+
delay *= 2
313+
case <-done:
314+
return // cancelled
315+
}
316+
}
317+
// This function may block due to fsnotify/fsnotify#502.
318+
err := w.watcher.Add(path)
319+
if afterAddHook != nil {
320+
afterAddHook(path, err)
321+
}
322+
if err == nil {
323+
return
324+
}
325+
}
260326
}
261327

262-
func (w *Watcher) addKnownDir(path string) {
328+
var afterAddHook func(path string, err error)
329+
330+
// addWatchHandle registers a new directory watch.
331+
// The returned 'done' channel channel should be used to signal cancellation of
332+
// a pending watch.
333+
// It returns nil if the watcher is already closing.
334+
func (w *Watcher) addWatchHandle(path string) chan struct{} {
263335
w.mu.Lock()
264336
defer w.mu.Unlock()
265-
w.knownDirs[path] = true
337+
338+
if w.dirCancel == nil { // file watcher is closing.
339+
return nil
340+
}
341+
342+
done := make(chan struct{})
343+
w.dirCancel[path] = done
344+
return done
266345
}
267346

268-
func (w *Watcher) removeKnownDir(path string) {
347+
// removeWatchHandle removes the handle for a directory watch and cancels any
348+
// pending watch attempt for that path.
349+
func (w *Watcher) removeWatchHandle(path string) {
269350
w.mu.Lock()
270351
defer w.mu.Unlock()
271-
delete(w.knownDirs, path)
352+
353+
if done, ok := w.dirCancel[path]; ok {
354+
delete(w.dirCancel, path)
355+
close(done)
356+
}
357+
}
358+
359+
// close removes all handles and cancels all pending watch attempt for that path
360+
// and set dirCancel to nil which prevent any future watch attempts.
361+
func (w *Watcher) close() {
362+
w.mu.Lock()
363+
dirCancel := w.dirCancel
364+
w.dirCancel = nil
365+
w.mu.Unlock()
366+
367+
for _, ch := range dirCancel {
368+
close(ch)
369+
}
272370
}
273371

274-
func (w *Watcher) isKnownDir(path string) bool {
372+
// isWatchedDir reports whether the given path has a watch handle, meaning it is
373+
// a directory the watcher is managing.
374+
func (w *Watcher) isWatchedDir(path string) bool {
275375
w.mu.Lock()
276376
defer w.mu.Unlock()
277377

278-
_, isDir := w.knownDirs[path]
378+
_, isDir := w.dirCancel[path]
279379
return isDir
280380
}
281381

@@ -297,27 +397,35 @@ func (w *Watcher) addEvent(event protocol.FileEvent) {
297397
}
298398
}
299399

300-
func (w *Watcher) sendEvents(handler func([]protocol.FileEvent, error)) {
400+
func (w *Watcher) drainEvents() []protocol.FileEvent {
301401
w.mu.Lock()
302402
events := w.events
303403
w.events = nil
304404
w.mu.Unlock()
305405

306-
if len(events) != 0 {
307-
handler(events, nil)
308-
}
406+
return events
309407
}
310408

311409
// Close shuts down the watcher, waits for the internal goroutine to terminate,
312410
// and returns any final error.
313411
func (w *Watcher) Close() error {
412+
// Cancel any ongoing watch registration.
413+
w.close()
414+
415+
// Wait for all watch registration goroutines to finish, including their
416+
// error handling. This ensures that:
417+
// - All [Watcher.watchDir] goroutines have exited and sent their errors, so
418+
// it is safe to close the internal error channel.
419+
// - There are no ongoing [fsnotify.Watcher.Add] calls, so it is safe to
420+
// close the fsnotify watcher (see fsnotify/fsnotify#704).
421+
w.watchers.Wait()
422+
close(w.errs)
423+
314424
err := w.watcher.Close()
315425

316-
// Wait for the go routine to finish. So all the channels will be closed and
317-
// all go routine will be terminated.
426+
// Wait for the main run loop to terminate.
318427
close(w.stop)
319-
320-
w.wg.Wait()
428+
w.runners.Wait()
321429

322430
return err
323431
}

0 commit comments

Comments
 (0)