Skip to content

Commit 6e90393

Browse files
Fix parent vs child race (#1141)
1 parent 0b80b2d commit 6e90393

File tree

2 files changed

+105
-9
lines changed

2 files changed

+105
-9
lines changed

internal/context.go

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -223,14 +223,19 @@ func propagateCancel(parent Context, child canceler) {
223223
return // parent is never canceled
224224
}
225225
if p, ok := parentCancelCtx(parent); ok {
226+
p.cancelLock.Lock()
226227
if p.err != nil {
228+
p.cancelLock.Unlock()
227229
// parent has already been canceled
228230
child.cancel(false, p.err)
229231
} else {
232+
p.childrenLock.Lock()
230233
if p.children == nil {
231234
p.children = make(map[canceler]bool)
232235
}
233236
p.children[child] = true
237+
p.childrenLock.Unlock()
238+
p.cancelLock.Unlock()
234239
}
235240
} else {
236241
go func() {
@@ -269,6 +274,9 @@ func removeChild(parent Context, child canceler) {
269274
if !ok {
270275
return
271276
}
277+
278+
p.childrenLock.Lock()
279+
defer p.childrenLock.Unlock()
272280
if p.children != nil {
273281
delete(p.children, child)
274282
}
@@ -288,11 +296,12 @@ type cancelCtx struct {
288296

289297
done Channel // closed by the first cancel call.
290298

291-
mu sync.Mutex
292-
canceled bool
299+
cancelLock sync.Mutex
300+
canceled bool
293301

294-
children map[canceler]bool // set to nil by the first cancel call
295-
err error // set to non-nil by the first cancel call
302+
childrenLock sync.Mutex
303+
children map[canceler]bool // set to nil by the first cancel call
304+
err error // set to non-nil by the first cancel call
296305
}
297306

298307
func (c *cancelCtx) Done() Channel {
@@ -307,32 +316,48 @@ func (c *cancelCtx) String() string {
307316
return fmt.Sprintf("%v.WithCancel", c.Context)
308317
}
309318

319+
func (c *cancelCtx) getChildren() []canceler {
320+
c.childrenLock.Lock()
321+
defer c.childrenLock.Unlock()
322+
323+
out := []canceler{}
324+
for key := range c.children {
325+
out = append(out, key)
326+
}
327+
return out
328+
}
329+
310330
// cancel closes c.done, cancels each of c's children, and, if
311331
// removeFromParent is true, removes c from its parent's children.
312332
func (c *cancelCtx) cancel(removeFromParent bool, err error) {
313-
c.mu.Lock()
333+
c.cancelLock.Lock()
314334
if c.canceled {
315-
c.mu.Unlock()
335+
c.cancelLock.Unlock()
316336
// calling cancel from multiple go routines isn't safe
317337
// avoid a data race by only allowing the first call
318338
return
319339
}
320340
c.canceled = true
321-
c.mu.Unlock()
322341

323342
if err == nil {
324343
panic("context: internal error: missing cancel error")
325344
}
326345
if c.err != nil {
346+
c.cancelLock.Unlock()
327347
return // already canceled
328348
}
329349
c.err = err
350+
c.cancelLock.Unlock()
330351
c.done.Close()
331-
for child := range c.children {
352+
353+
children := c.getChildren()
354+
for _, child := range children {
332355
// NOTE: acquiring the child's lock while holding parent's lock.
333356
child.cancel(false, err)
334357
}
358+
c.childrenLock.Lock()
335359
c.children = nil
360+
c.childrenLock.Unlock()
336361

337362
if removeFromParent {
338363
removeChild(c.Context, c)

internal/context_test.go

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,49 @@ import (
2828
"github.com/stretchr/testify/assert"
2929
)
3030

31-
func TestContext_RaceRegression(t *testing.T) {
31+
func TestContextChildParentCancelRace(t *testing.T) {
32+
/*
33+
Testing previous race happened while child and parent cancelling at the same time
34+
While child is trying to remove itself from the parent, parent tries to iterate
35+
its children and cancel them at the same time.
36+
*/
37+
env := newTestWorkflowEnv(t)
38+
39+
wf := func(ctx Context) error {
40+
parentCtx, parentCancel := WithCancel(ctx)
41+
defer parentCancel()
42+
43+
type cancelerContext struct {
44+
ctx Context
45+
canceler func()
46+
}
47+
48+
children := []cancelerContext{}
49+
numChildren := 100
50+
51+
for i := 0; i < numChildren; i++ {
52+
c, canceler := WithCancel(parentCtx)
53+
children = append(children, cancelerContext{
54+
ctx: c,
55+
canceler: canceler,
56+
})
57+
}
58+
59+
for i := 0; i < numChildren; i++ {
60+
go children[i].canceler()
61+
if i == numChildren/2 {
62+
go parentCancel()
63+
}
64+
}
65+
66+
return nil
67+
}
68+
env.RegisterWorkflow(wf)
69+
env.ExecuteWorkflow(wf)
70+
assert.NoError(t, env.GetWorkflowError())
71+
}
72+
73+
func TestContextConcurrentCancelRace(t *testing.T) {
3274
/*
3375
A race condition existed due to concurrently ending goroutines on shutdown (i.e. closing their chan without waiting
3476
on them to finish shutdown), which executed... quite a lot of non-concurrency-safe code in a concurrent way. All
@@ -57,3 +99,32 @@ func TestContext_RaceRegression(t *testing.T) {
5799
env.ExecuteWorkflow(wf)
58100
assert.NoError(t, env.GetWorkflowError())
59101
}
102+
103+
func TestContextAddChildCancelParentRace(t *testing.T) {
104+
/*
105+
It's apparently also possible to race on adding children while propagating the cancel to children.
106+
*/
107+
env := newTestWorkflowEnv(t)
108+
wf := func(ctx Context) error {
109+
ctx, cancel := WithCancel(ctx)
110+
racyCancel := func(ctx Context) {
111+
defer cancel() // defer is necessary as Sleep will never return due to Goexit
112+
defer func() {
113+
_, ccancel := WithCancel(ctx)
114+
cancel()
115+
ccancel()
116+
}()
117+
_ = Sleep(ctx, time.Hour)
118+
}
119+
// start a handful to increase odds of a race being detected
120+
for i := 0; i < 10; i++ {
121+
Go(ctx, racyCancel)
122+
}
123+
124+
_ = Sleep(ctx, time.Minute) // die early
125+
return nil
126+
}
127+
env.RegisterWorkflow(wf)
128+
env.ExecuteWorkflow(wf)
129+
assert.NoError(t, env.GetWorkflowError())
130+
}

0 commit comments

Comments
 (0)