Maintain a stable order of child contexts, resolving non-determinism around cancels (#1183)

Groxx · web-flow · commit dcaec7737070 · 2022-10-31T19:03:39.000-07:00
After some tough-to-identify determinism issues in what appeared to be correct user workflows, and some investigations by both us and them:
This PR resolves a non-deterministic behavior involving child context cancellation propagation, in particular when unblocking selects based on those contexts (possibly transitively, e.g. via activity futures).

As this was previously non-deterministic behavior, both the previous and new code _could_ cause determinism failures after upgrading... but the random execution order previously stood a good chance of failing a few times and then automatically resolving itself.  Unfortunately that is not maintained here - failures are likely to be permanent.

Resolving this is... probably not feasible currently.  We do not record client-library versions in workflow history, so we cannot maintain backwards compatibility accurately in scenarios like this.  We almost certainly _should_ record this on decisions, at least when it changes - we could randomly cancel entries in the list when replaying old decisions, and allow the random behavior to eventually choose a stable execution on a host somewhere.

In any case, for all future workflows this makes behavior deterministic, and should resolve the issue for good.

---

A full repro can be seen with:
1. Create multiple cancellable child contexts off a single cancellable parent context, populating its child-context map.
2. Base some behavior off each child context.  Any one-shot logic works, but activities are pretty easy and occur a lot in practice (i.e. waiting on N activities, and being able to cancel many at once).
3. Add the resulting futures to a selector, and block on the selector.
4. Cancel the parent context.  This will:
    1. Cancel the parent context
    2. Propagate that to a _random_ child context
    3. Which will synchronously resolve the future(s) attached to the child context
    4. Which will synchronously trigger any pending callbacks
    5. One of which is a "first call wins" closure which the selector uses to choose which branch to execute

Maintaining the child contexts in _an_ order resolves this, as it ensures the same child is canceled first (then second, etc) each time.  Any order should work.
For clearer semantics, I chose to implement it as a compacting FIFO list (as children can remove themselves if they are cancelled independently).  This is not noticeably costly (maintenance in a large list will be dwarfed by any side effects of canceling), and it makes the order very easy to define and, hopefully, to maintain - which matters, as the order _must not_ be changed.

---

This order decision _will not_ be a defined semantic of workflows, however.  Cancellation of multiple futures / selector branches _should_ be treated as unordered, and implementing exactly the same behavior in other languages may not be efficient.
In a future implementation it may be worth making selectors choose from _any_ available branch pseudo-randomly, e.g. by run-ID, for the same reason Go explicitly randomizes these behaviors: it prevents accidentally depending on implementation details, by exposing logical flaws sooner.
diff --git a/internal/context.go b/internal/context.go
@@ -230,10 +230,7 @@ func propagateCancel(parent Context, child canceler) {
 			child.cancel(false, p.err)
 		} else {
 			p.childrenLock.Lock()
-			if p.children == nil {
-				p.children = make(map[canceler]bool)
-			}
-			p.children[child] = true
+			p.children = append(p.children, child)
 			p.childrenLock.Unlock()
 			p.cancelLock.Unlock()
 		}
@@ -258,7 +255,7 @@ func parentCancelCtx(parent Context) (*cancelCtx, bool) {
 		case *cancelCtx:
 			return c, true
 		// TODO: Uncomment once timer story is implemented
-		//case *timerCtx:
+		// case *timerCtx:
 		//	return c.cancelCtx, true
 		case *valueCtx:
 			parent = c.Context
@@ -278,10 +275,31 @@ func removeChild(parent Context, child canceler) {
 	p.childrenLock.Lock()
 	defer p.childrenLock.Unlock()
 	if p.children != nil {
-		delete(p.children, child)
+		p.children = removeChildFromSlice(p.children, child) // store the result: the helper cannot shrink this slice header for us
 	}
 }
 
+// removeChildFromSlice drops the first occurrence of child from children and returns the result.
+// The backing array is compacted in place, so callers must use the returned slice.  Other code is
+// expected to add each child only once; that is neither verified nor enforced here.
+func removeChildFromSlice(children []canceler, child canceler) []canceler {
+	// Relative order of the remaining entries is kept on purpose: it is easier to reason about
+	// (e.g. while bug hunting).  Swapping the last entry into the gap would be just as correct
+	// and a little cheaper, but switching to it later would likely change the order in which
+	// code executes, so it should not be done casually.
+	for at := range children {
+		if children[at] != child {
+			continue
+		}
+		// shift the tail left over the match, then trim the now-duplicated final slot
+		shrunk := len(children) - 1
+		copy(children[at:], children[at+1:])
+		return children[:shrunk]
+	}
+	// not present: nothing to remove
+	return children
+}
+
 // A canceler is a context type that can be canceled directly.  The
 // implementations are *cancelCtx and *timerCtx.
 type canceler interface {
@@ -300,8 +318,8 @@ type cancelCtx struct {
 	canceled   bool
 
 	childrenLock sync.Mutex
-	children     map[canceler]bool // set to nil by the first cancel call
-	err          error             // set to non-nil by the first cancel call
+	children     []canceler
+	err          error // set to non-nil by the first cancel call
 }
 
 func (c *cancelCtx) Done() Channel {
@@ -320,11 +338,9 @@ func (c *cancelCtx) getChildren() []canceler {
 	c.childrenLock.Lock()
 	defer c.childrenLock.Unlock()
 
-	out := []canceler{}
-	for key := range c.children {
-		out = append(out, key)
-	}
-	return out
+	dup := make([]canceler, len(c.children))
+	copy(dup, c.children)
+	return dup
 }
 
 // cancel closes c.done, cancels each of c's children, and, if
@@ -374,7 +390,7 @@ func (c *cancelCtx) cancel(removeFromParent bool, err error) {
 //
 // Canceling this context releases resources associated with it, so code should
 // call cancel as soon as the operations running in this Context complete.
-//func WithDeadline(parent Context, deadline time.Time) (Context, CancelFunc) {
+// func WithDeadline(parent Context, deadline time.Time) (Context, CancelFunc) {
 //	if cur, ok := parent.Deadline(); ok && cur.Before(deadline) {
 //		// The current deadline is already sooner than the new one.
 //		return WithCancel(parent)
@@ -395,27 +411,27 @@ func (c *cancelCtx) cancel(removeFromParent bool, err error) {
 //		})
 //	}
 //	return c, func() { c.cancel(true, Canceled) }
-//}
+// }
 //
-//// A timerCtx carries a timer and a deadline.  It embeds a cancelCtx to
-//// implement Done and Err.  It implements cancel by stopping its timer then
-//// delegating to cancelCtx.cancel.
-//type timerCtx struct {
+// // A timerCtx carries a timer and a deadline.  It embeds a cancelCtx to
+// // implement Done and Err.  It implements cancel by stopping its timer then
+// // delegating to cancelCtx.cancel.
+// type timerCtx struct {
 //	*cancelCtx
 //	timer *time.Timer // Under cancelCtx.mu.
 //
 //	deadline time.Time
-//}
+// }
 //
-//func (c *timerCtx) Deadline() (deadline time.Time, ok bool) {
+// func (c *timerCtx) Deadline() (deadline time.Time, ok bool) {
 //	return c.deadline, true
-//}
+// }
 //
-//func (c *timerCtx) String() string {
+// func (c *timerCtx) String() string {
 //	return fmt.Sprintf("%v.WithDeadline(%s [%s])", c.cancelCtx.Context, c.deadline, c.deadline.Sub(time.Now()))
-//}
+// }
 //
-//func (c *timerCtx) cancel(removeFromParent bool, err error) {
+// func (c *timerCtx) cancel(removeFromParent bool, err error) {
 //	c.cancelCtx.cancel(false, err)
 //	if removeFromParent {
 //		// Remove this timerCtx from its parent cancelCtx's children.
@@ -425,7 +441,7 @@ func (c *cancelCtx) cancel(removeFromParent bool, err error) {
 //		c.timer.Stop()
 //		c.timer = nil
 //	}
-//}
+// }
 //
 // WithTimeout returns WithDeadline(parent, time.Now().Add(timeout)).
 //
@@ -437,9 +453,9 @@ func (c *cancelCtx) cancel(removeFromParent bool, err error) {
 // 		defer cancel()  // releases resources if slowOperation completes before timeout elapses
 // 		return slowOperation(ctx)
 // 	}
-//func WithTimeout(parent Context, timeout time.Duration) (Context, CancelFunc) {
+// func WithTimeout(parent Context, timeout time.Duration) (Context, CancelFunc) {
 //	return WithDeadline(parent, time.Now().Add(timeout))
-//}
+// }
 
 // WithValue returns a copy of parent in which the value associated with key is
 // val.
diff --git a/internal/context_test.go b/internal/context_test.go
@@ -22,10 +22,13 @@
 package internal
 
 import (
+	"context"
 	"testing"
 	"time"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+	"github.com/stretchr/testify/require"
 )
 
 func TestContextChildParentCancelRace(t *testing.T) {
@@ -128,3 +131,132 @@ func TestContextAddChildCancelParentRace(t *testing.T) {
 	env.ExecuteWorkflow(wf)
 	assert.NoError(t, env.GetWorkflowError())
 }
+
+// TestContextCancellationOrderDeterminism checks that canceling a parent fires selector branches in child-creation order.
+func TestContextCancellationOrderDeterminism(t *testing.T) {
+	/*
+		Previously, child contexts were stored in a map, so cancellation was propagated to them in random order.
+		The order of branches being selected in this test was random, both for the first event and in following ones.
+
+		In principle this should be fine, but it's possible for the effects of cancellation to trigger a selector's
+		future-done callback, which currently records the *real-time*-first event as the branch to unblock, rather than
+		doing something safer by design (e.g. choosing based on state when the selector's goroutine is unblocked).
+
+		Unfortunately, the selector cannot be changed without breaking currently-working workflows in a non-backwards-compatible way.
+
+		So the workaround for now is to maintain child-context order, so they are canceled in a consistent order.
+		As this order was not controlled before, and Go does a pretty good job at randomizing map iteration order,
+		converting non-determinism to determinism should be strictly no worse for backwards compatibility, and it
+		fixes the issue for future executions.
+	*/
+	check := func(t *testing.T, separateStart, separateSelect bool) {
+		env := newTestWorkflowEnv(t)
+		act := func(ctx context.Context) error {
+			return nil // never runs for real: replaced by the env.OnActivity mock below
+		}
+		wf := func(ctx Context) ([]int, error) {
+			ctx, cancel := WithCancel(ctx)
+			Go(ctx, func(ctx Context) {
+				_ = Sleep(ctx, time.Minute)
+				cancel()
+			})
+
+			// start some activities; the mock below delays them by 5 minutes, well after the 1-minute cancel above
+			ctx = WithActivityOptions(ctx, ActivityOptions{
+				TaskList:               "",
+				ScheduleToCloseTimeout: time.Hour,
+				ScheduleToStartTimeout: time.Hour,
+				StartToCloseTimeout:    time.Hour,
+			})
+			s := NewSelector(ctx)
+			var result []int
+			for i := 0; i < 10; i++ {
+				i := i
+				// each activity needs its own child context: a future alone does not register as a child of ctx
+				cctx, ccancel := WithCancel(ctx)
+
+				s.AddFuture(ExecuteActivity(cctx, act), func(f Future) {
+					ccancel() // TODO: is this necessary to prevent leaks?  if it is, how can we make it not?
+					err := f.Get(ctx, nil)
+					if err == nil || !IsCanceledError(err) {
+						// fail the test, this should not happen - activities must be canceled or it's not valid.
+						t.Errorf("activity completion or failure for some reason other than cancel: %v", err)
+					}
+					result = append(result, i)
+				})
+
+				if separateStart {
+					// yield so they are submitted one at a time, in case that matters
+					_ = Sleep(ctx, time.Second)
+				}
+			}
+			for i := 0; i < 10; i++ {
+				if separateSelect {
+					// yield so they are selected one at a time, in case that matters
+					_ = Sleep(ctx, time.Second)
+				}
+				s.Select(ctx)
+			}
+
+			return result, nil
+		}
+		env.RegisterWorkflow(wf)
+		env.RegisterActivity(act)
+
+		// activities must not complete before the workflow cancels them
+		env.OnActivity(act, mock.Anything).After(5 * time.Minute).Return(nil)
+
+		env.ExecuteWorkflow(wf)
+		require.NoError(t, env.GetWorkflowError())
+		var result []int
+		require.NoError(t, env.GetWorkflowResult(&result))
+		require.NotEmpty(t, result)
+		assert.Equal(t, 0, result[0], "first activity to be created should be the first one canceled")
+		assert.Equal(t, []int{1, 2, 3, 4, 5, 6, 7, 8, 9}, result[1:], "other activities should finish in a consistent (but undefined) order")
+	}
+
+	type variant struct {
+		name           string
+		separateStart  bool
+		separateSelect bool
+	}
+	// all variants expose this behavior, but being a bit more exhaustive in the face
+	// of decision-scheduling differences seems good.
+	for _, test := range []variant{
+		{"many in one decision", false, false},
+		{"many started at once, selected slowly", false, true},
+		{"started slowly, selected quickly", true, false},
+		{"started and selected slowly", true, true},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			check(t, test.separateStart, test.separateSelect)
+		})
+	}
+}
+
+// BenchmarkSliceMaintenance compares in-place ways of deleting a middle element; per the author, all essentially identical.
+func BenchmarkSliceMaintenance(b *testing.B) {
+	b.Run("append", func(b *testing.B) {
+		data := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+		for i := 0; i < b.N; i++ {
+			data = append(data[:5], data[6:]...)
+			data = append(data, i) // keep the slice the same size for all iterations
+		}
+	})
+	b.Run("copy", func(b *testing.B) {
+		data := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+		for i := 0; i < b.N; i++ {
+			copy(data[5:], data[6:])
+			data = data[:9]        // trim to actual size, as the last value is now duplicated.  capacity is still 10.
+			data = append(data, i) // keep the slice the same size for all iterations
+		}
+	})
+	b.Run("copy explicit capacity", func(b *testing.B) {
+		data := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+		for i := 0; i < b.N; i++ {
+			copy(data[5:], data[6:])
+			data = data[:9:10]     // trim to actual size, as the last value is now duplicated.  explicitly reserve 10 cap.
+			data = append(data, i) // keep the slice the same size for all iterations
+		}
+	})
+}