Skip to content

Commit be09da5

Browse files
authored
lockedfile: use a retry loop to suppress EDEADLK on AIX and Solaris (#92)
Apply upstream CL 222277 per @bcmills Automatically applied via: (command cd /path/to/go; git diff c6f678b6efd6622d335e6d4b659282bb2d16f5ba~1 c6f678b6efd6622d335e6d4b659282bb2d16f5ba) | git apply -p5 with a minor fix to then use github.com/rogpeppe/go-internal/testenv instead of internal/testenv Fixes #91
1 parent a7f0f53 commit be09da5

File tree

2 files changed

+164
-5
lines changed

2 files changed

+164
-5
lines changed

lockedfile/internal/filelock/filelock_fcntl.go

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,19 @@
1313
// or an F_OFD_SETLK command for 'fcntl', that allows for better concurrency and
1414
// does not require per-inode bookkeeping in the application.
1515
//
16-
// TODO(bcmills): If we add a build tag for Illumos (see golang.org/issue/20603)
17-
// then Illumos should use F_OFD_SETLK, and the resulting code would be as
18-
// simple as filelock_unix.go. We will still need the code in this file for AIX
19-
// or as long as Oracle Solaris provides only F_SETLK.
16+
// TODO(golang.org/issue/35618): add a syscall.Flock binding for Illumos and
17+
// switch it over to use filelock_unix.go.
2018

2119
package filelock
2220

2321
import (
2422
"errors"
2523
"io"
24+
"math/rand"
2625
"os"
2726
"sync"
2827
"syscall"
28+
"time"
2929
)
3030

3131
type lockType int16
@@ -91,7 +91,67 @@ func lock(f File, lt lockType) (err error) {
9191
wait <- f
9292
}
9393

94-
err = setlkw(f.Fd(), lt)
94+
// Spurious EDEADLK errors arise on platforms that compute deadlock graphs at
95+
// the process, rather than thread, level. Consider processes P and Q, with
96+
// threads P.1, P.2, and Q.3. The following trace is NOT a deadlock, but will be
97+
// reported as a deadlock on systems that consider only process granularity:
98+
//
99+
// P.1 locks file A.
100+
// Q.3 locks file B.
101+
// Q.3 blocks on file A.
102+
// P.2 blocks on file B. (This is erroneously reported as a deadlock.)
103+
// P.1 unlocks file A.
104+
// Q.3 unblocks and locks file A.
105+
// Q.3 unlocks files A and B.
106+
// P.2 unblocks and locks file B.
107+
// P.2 unlocks file B.
108+
//
109+
// These spurious errors were observed in practice on AIX and Solaris in
110+
// cmd/go: see https://golang.org/issue/32817.
111+
//
112+
// We work around this bug by treating EDEADLK as always spurious. If there
113+
// really is a lock-ordering bug between the interacting processes, it will
114+
// become a livelock instead, but that's not appreciably worse than if we had
115+
// a proper flock implementation (which generally does not even attempt to
116+
// diagnose deadlocks).
117+
//
118+
// In the above example, that changes the trace to:
119+
//
120+
// P.1 locks file A.
121+
// Q.3 locks file B.
122+
// Q.3 blocks on file A.
123+
// P.2 spuriously fails to lock file B and goes to sleep.
124+
// P.1 unlocks file A.
125+
// Q.3 unblocks and locks file A.
126+
// Q.3 unlocks files A and B.
127+
// P.2 wakes up and locks file B.
128+
// P.2 unlocks file B.
129+
//
130+
// We know that the retry loop will not introduce a *spurious* livelock
131+
// because, according to the POSIX specification, EDEADLK is only to be
132+
// returned when “the lock is blocked by a lock from another process”.
133+
// If that process is blocked on some lock that we are holding, then the
134+
// resulting livelock is due to a real deadlock (and would manifest as such
135+
// when using, for example, the flock implementation of this package).
136+
// If the other process is *not* blocked on some other lock that we are
137+
// holding, then it will eventually release the requested lock.
138+
139+
nextSleep := 1 * time.Millisecond
140+
const maxSleep = 500 * time.Millisecond
141+
for {
142+
err = setlkw(f.Fd(), lt)
143+
if err != syscall.EDEADLK {
144+
break
145+
}
146+
time.Sleep(nextSleep)
147+
148+
nextSleep += nextSleep
149+
if nextSleep > maxSleep {
150+
nextSleep = maxSleep
151+
}
152+
// Apply 10% jitter to avoid synchronizing collisions when we finally unblock.
153+
nextSleep += time.Duration((0.1*rand.Float64() - 0.05) * float64(nextSleep))
154+
}
95155

96156
if err != nil {
97157
unlock(f)

lockedfile/lockedfile_test.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,16 @@
88
package lockedfile_test
99

1010
import (
11+
"fmt"
1112
"io/ioutil"
1213
"os"
14+
"os/exec"
1315
"path/filepath"
1416
"testing"
1517
"time"
1618

19+
"github.com/rogpeppe/go-internal/testenv"
20+
1721
"github.com/rogpeppe/go-internal/lockedfile"
1822
)
1923

@@ -172,3 +176,98 @@ func TestCanLockExistingFile(t *testing.T) {
172176
f.Close()
173177
wait(t)
174178
}
179+
180+
// TestSpuriousEDEADLK verifies that the spurious EDEADLK reported in
181+
// https://golang.org/issue/32817 no longer occurs.
182+
func TestSpuriousEDEADLK(t *testing.T) {
183+
// P.1 locks file A.
184+
// Q.3 locks file B.
185+
// Q.3 blocks on file A.
186+
// P.2 blocks on file B. (Spurious EDEADLK occurs here.)
187+
// P.1 unlocks file A.
188+
// Q.3 unblocks and locks file A.
189+
// Q.3 unlocks files A and B.
190+
// P.2 unblocks and locks file B.
191+
// P.2 unlocks file B.
192+
193+
testenv.MustHaveExec(t)
194+
195+
dirVar := t.Name() + "DIR"
196+
197+
if dir := os.Getenv(dirVar); dir != "" {
198+
// Q.3 locks file B.
199+
b, err := lockedfile.Edit(filepath.Join(dir, "B"))
200+
if err != nil {
201+
t.Fatal(err)
202+
}
203+
defer b.Close()
204+
205+
if err := ioutil.WriteFile(filepath.Join(dir, "locked"), []byte("ok"), 0666); err != nil {
206+
t.Fatal(err)
207+
}
208+
209+
// Q.3 blocks on file A.
210+
a, err := lockedfile.Edit(filepath.Join(dir, "A"))
211+
// Q.3 unblocks and locks file A.
212+
if err != nil {
213+
t.Fatal(err)
214+
}
215+
defer a.Close()
216+
217+
// Q.3 unlocks files A and B.
218+
return
219+
}
220+
221+
dir, remove := mustTempDir(t)
222+
defer remove()
223+
224+
// P.1 locks file A.
225+
a, err := lockedfile.Edit(filepath.Join(dir, "A"))
226+
if err != nil {
227+
t.Fatal(err)
228+
}
229+
230+
cmd := exec.Command(os.Args[0], "-test.run="+t.Name())
231+
cmd.Env = append(os.Environ(), fmt.Sprintf("%s=%s", dirVar, dir))
232+
233+
qDone := make(chan struct{})
234+
waitQ := mustBlock(t, "Edit A and B in subprocess", func() {
235+
out, err := cmd.CombinedOutput()
236+
if err != nil {
237+
t.Errorf("%v:\n%s", err, out)
238+
}
239+
close(qDone)
240+
})
241+
242+
// Wait until process Q has either failed or locked file B.
243+
// Otherwise, P.2 might not block on file B as intended.
244+
locked:
245+
for {
246+
if _, err := os.Stat(filepath.Join(dir, "locked")); !os.IsNotExist(err) {
247+
break locked
248+
}
249+
select {
250+
case <-qDone:
251+
break locked
252+
case <-time.After(1 * time.Millisecond):
253+
}
254+
}
255+
256+
waitP2 := mustBlock(t, "Edit B", func() {
257+
// P.2 blocks on file B. (Spurious EDEADLK occurs here.)
258+
b, err := lockedfile.Edit(filepath.Join(dir, "B"))
259+
// P.2 unblocks and locks file B.
260+
if err != nil {
261+
t.Error(err)
262+
return
263+
}
264+
// P.2 unlocks file B.
265+
b.Close()
266+
})
267+
268+
// P.1 unlocks file A.
269+
a.Close()
270+
271+
waitQ(t)
272+
waitP2(t)
273+
}

0 commit comments

Comments
 (0)