Skip to content

Commit bf7add6

Browse files
craig[bot]dt
andcommitted
Merge #154037
154037: util/ctxgroup: print recovered panic stacks during crash r=jeffswenson a=jeffswenson When ctxgroup recovers a panic in a worker to be rethrown in Wait(), it uses errors.WithStack to try to capture the stack to the original panic, and subsequent recover, in the error that is stored to be rethrown on the Wait goroutine. And indeed, WithStack captures it... but only to have it ignored and not printed by the runtime crash handler, which just invokes Error(), if/when the rethrown panic bubbles up if unrecovered. This makes such panics _very_ hard to debug: all information about where they were actually raised is gone, making this rethrow-on-Wait actually much worse than just not recovering at all. This changes the capture-and-rethrow to wrap the captured error in a thin wrapper that overrides .Error() to incldue the stack, so that it is printed by the runtime crash handler if a rethrown panic makes it there. Release note: none. Epic: none. Informs: #153347 Co-authored-by: David Taylor <[email protected]>
2 parents 1345a57 + 38b6a7d commit bf7add6

File tree

2 files changed

+87
-7
lines changed

2 files changed

+87
-7
lines changed

pkg/util/ctxgroup/ctxgroup.go

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ package ctxgroup
116116

117117
import (
118118
"context"
119+
"fmt"
119120

120121
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
121122
"github.com/cockroachdb/errors"
@@ -147,7 +148,7 @@ func (g Group) Wait() error {
147148
err := g.wrapped.Wait()
148149

149150
if g.panicMu.payload != nil {
150-
panic(g.panicMu.payload)
151+
panic(verboseError{g.panicMu.payload})
151152
}
152153

153154
if err != nil {
@@ -224,15 +225,39 @@ func GoAndWait(ctx context.Context, fs ...func(ctx context.Context) error) error
224225
}
225226

226227
// wrapPanic turns r into an error if it is not one already.
227-
//
228-
// TODO(dt): consider raw recovered payload as well.
229-
//
230-
// TODO(dt): replace this with logcrash.PanicAsError after moving that to a new
231-
// standalone pkg (since we cannot depend on `util/log` here) and teaching it to
232-
// preserve the original payload.
233228
func wrapPanic(depth int, r interface{}) error {
229+
// If r is already a verboseError, we remove the `verboseError` wrapper
230+
// because it will be reapplied when the panic is rethrown by wait. The stack
231+
// trace is attached to the inner error. Its desirable to remove the verbose
232+
// format wrapper from the child because the parent verbose wrapper will
233+
// print the child errors .Error() method and the child's stack. If we left
234+
// the wrapper in place, this would print the child error's stack twice.
235+
if err, ok := r.(verboseError); ok {
236+
return errors.WithStackDepth(err.error, depth+1)
237+
}
234238
if err, ok := r.(error); ok {
235239
return errors.WithStackDepth(err, depth+1)
236240
}
237241
return errors.NewWithDepthf(depth+1, "panic: %v", r)
238242
}
243+
244+
type verboseError struct {
245+
error // always a errors.WithStack.withStack
246+
}
247+
248+
// Error overrides WithStack.withStack()'s Error to include the stack.
249+
//
250+
// Typically withstack's Error() just delegates to the underlying error and
251+
// requires formatting with %+v to print the stacktrace. However this wrapper is
252+
// used to wrap a recovered panic that is *rethrown*. If this rethrown panic is
253+
// not recovered, the runtime will eventually crash and print it... by calling
254+
// .Error(), which will *not* indicate the stack to the original panic we so
255+
// dutifully captured by using WithStack. So override .Error() to include the
256+
// stack, but leave .Format() to fallthrough to withStack as usual.
257+
func (p verboseError) Error() string {
258+
return fmt.Sprintf("%+v", p.error)
259+
}
260+
261+
func (p verboseError) Unwrap() error {
262+
return p.error
263+
}

pkg/util/ctxgroup/ctxgroup_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ func funcThatPanics(x interface{}) {
4444
panic(x)
4545
}
4646

47+
func deferThatPanics() {
48+
defer func() {
49+
panic("in deferred panic")
50+
}()
51+
funcThatPanics("in func panic")
52+
}
53+
4754
func TestPanic(t *testing.T) {
4855
defer leaktest.AfterTest(t)()
4956

@@ -65,3 +72,51 @@ func TestPanic(t *testing.T) {
6572
}()
6673
})
6774
}
75+
76+
func TestPanicInDefer(t *testing.T) {
77+
defer leaktest.AfterTest(t)()
78+
79+
defer func() {
80+
r := recover()
81+
require.NotNil(t, r, "expected panic from Wait")
82+
err := r.(error)
83+
require.Contains(t, err.Error(), "in deferred panic", "expected deferred panic message")
84+
require.Contains(t, err.Error(), "deferThatPanics", "expected deferThatPanics in stack")
85+
require.Contains(t, err.Error(), "funcThatPanics", "expected deferThatPanics in stack")
86+
t.Log("recovered: ", err.Error())
87+
}()
88+
89+
g := WithContext(context.Background())
90+
g.GoCtx(func(_ context.Context) error { deferThatPanics(); return errors.New("unseen") })
91+
t.Fatal(g.Wait(), "unreachable if we hit expected panic in Wait")
92+
}
93+
94+
func innerGroup() error {
95+
g := WithContext(context.Background())
96+
g.GoCtx(func(_ context.Context) error { funcThatPanics("inner"); return nil })
97+
return g.Wait()
98+
}
99+
100+
func outerGroup() error {
101+
g := WithContext(context.Background())
102+
g.GoCtx(func(_ context.Context) error { return innerGroup() })
103+
return g.Wait()
104+
}
105+
106+
func TestNestedContextGroup(t *testing.T) {
107+
defer leaktest.AfterTest(t)()
108+
109+
defer func() {
110+
r := recover()
111+
require.NotNil(t, r, "expected panic from Wait")
112+
err := r.(error)
113+
require.Contains(t, err.Error(), "inner", "expected inner panic message")
114+
require.Contains(t, err.Error(), "innerGroup", "expected innerGroup in stack")
115+
require.Contains(t, err.Error(), "outerGroup", "expected outerGroup in stack")
116+
t.Log("recovered: ", err.Error())
117+
}()
118+
119+
g := WithContext(context.Background())
120+
g.GoCtx(func(_ context.Context) error { return outerGroup() })
121+
t.Fatal(g.Wait(), "unreachable if we hit expected panic in Wait")
122+
}

0 commit comments

Comments
 (0)