Skip to content

Commit 7db1c58

Browse files
authored
Accounts for more panic scenarios (#1741)
1 parent abd0c96 commit 7db1c58

File tree

2 files changed

+250
-26
lines changed

2 files changed

+250
-26
lines changed

tools/flakeguard/runner/runner.go

Lines changed: 99 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,9 @@ func (r *Runner) parseTestResults(runPrefix string, runCount int) ([]reports.Tes
383383
}
384384
panicTest, timeout, err := attributePanicToTest(outputs)
385385
if err != nil {
386-
log.Warn().Err(err).Msg("Unable to attribute panic to a test")
387-
panicTest = "UnableToAttributePanicTestPleaseInvestigate"
386+
log.Error().Msg("Unable to attribute panic to a test")
387+
fmt.Println(err.Error())
388+
panicTest = "UnableToAttributePanicToTestPleaseInvestigate"
388389
}
389390
panicTestKey := fmt.Sprintf("%s/%s", entryLine.Package, panicTest)
390391

@@ -612,27 +613,106 @@ func (r *Runner) transformTestOutputFiles(filePaths []string) error {
612613
return nil
613614
}
614615

616+
var (
617+
ErrFailedToAttributePanicToTest = errors.New("failed to attribute panic to test")
618+
ErrFailedToParseTimeoutDuration = errors.New("failed to parse timeout duration")
619+
ErrFailedToExtractTimeoutDuration = errors.New("failed to extract timeout duration")
620+
ErrDetectedLogAfterCompleteFailedAttribution = errors.New("detected a log after test has completed panic, but failed to properly attribute it")
621+
ErrDetectedTimeoutFailedParse = errors.New("detected test timeout, but failed to parse the duration from the test")
622+
ErrDetectedTimeoutFailedAttribution = errors.New("detected test timeout, but failed to attribute the timeout to a specific test")
623+
)
624+
615625
// attributePanicToTest properly attributes panics to the test that caused them.
626+
// There are a lot of edge cases and strange behavior in Go test output when it comes to panics.
616627
func attributePanicToTest(outputs []string) (test string, timeout bool, err error) {
617-
// Regex to extract a valid test function name from a panic message, e.g.
618-
// github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00)
619-
nestedTestNameRe := regexp.MustCompile(`\.(Test[A-Z]\w+)(?:\.[^(]+)?\s*\(`)
620-
// Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g.
621-
// TestTimedOut (10m0s)
622-
timedOutTestNamRe := regexp.MustCompile(`^(Test\w+)\W+\(.*\)$`)
623-
624-
for _, o := range outputs {
625-
outputs = append(outputs, o)
626-
matchNestedTestName := nestedTestNameRe.FindStringSubmatch(o)
627-
matchTimedOutTestName := timedOutTestNamRe.FindStringSubmatch(o)
628-
if len(matchNestedTestName) > 1 {
629-
return strings.TrimSpace(matchNestedTestName[1]), false, nil
630-
} else if len(matchTimedOutTestName) > 1 {
631-
return strings.TrimSpace(matchTimedOutTestName[1]), true, nil
628+
var (
629+
// Regex to extract a valid test function name from a panic message.
630+
// This is the most common situation for test panics, e.g.
631+
// github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00)
632+
nestedTestNameRe = regexp.MustCompile(`\.(Test[^\s]+?)(?:\.[^(]+)?\s*\(`)
633+
634+
// Regex to check if the panic is from a log after a goroutine, e.g.
635+
// panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: <Log line>
636+
testLogAfterTestRe = regexp.MustCompile(`^panic: Log in goroutine after (Test[^\s]+) has completed:`)
637+
638+
// Check if the panic message indicates a timeout, e.g.
639+
// panic: test timed out after 10m0s
640+
didTestTimeoutRe = regexp.MustCompile(`^panic: test timed out after ([^\s]+)`)
641+
// Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g.
642+
// TestTimedOut (10m0s)
643+
timedOutTestNameRe = regexp.MustCompile(`^(Test[^\s]+)\s+\((.*)\)`)
644+
timeoutDurationStr string
645+
timeoutDuration time.Duration
646+
)
647+
648+
for _, output := range outputs {
649+
output = strings.TrimSpace(output)
650+
// Check if the panic message indicates a timeout
651+
// If so, extract the timeout duration and switch to timeout mode
652+
if didTestTimeoutRe.MatchString(output) {
653+
timeout = true
654+
if len(didTestTimeoutRe.FindStringSubmatch(output)) > 1 {
655+
timeoutDurationStr = didTestTimeoutRe.FindStringSubmatch(output)[1]
656+
timeoutDuration, err = time.ParseDuration(timeoutDurationStr)
657+
if err != nil {
658+
return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrFailedToParseTimeoutDuration, output)
659+
}
660+
} else {
661+
return "", true, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToExtractTimeoutDuration, output)
662+
}
663+
}
664+
665+
if testLogAfterTestRe.MatchString(output) {
666+
// If the panic message indicates a log after a test, extract the test name
667+
match := testLogAfterTestRe.FindStringSubmatch(output)
668+
if len(match) > 1 {
669+
testName := strings.TrimSpace(match[1])
670+
return testName, timeout, nil
671+
} else {
672+
return "", false, fmt.Errorf(
673+
"%w using this output:\n\n%s", ErrDetectedLogAfterCompleteFailedAttribution, strings.Join(outputs, ""),
674+
)
675+
}
676+
}
677+
678+
// If in timeout mode, look for test names in the panic message and check if any match the timeout duration, e.g.
679+
// panic: test timed out after 10m0s
680+
// running tests:
681+
// TestAddAndPromoteCandidatesForNewChain (22s) // Nope
682+
// TestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s) // Nope
683+
// TestTimeout (10m0s) // Yes
684+
// TestConnectNewChain/Use_production_router_(with_MCMS) (1m1s) // Nope
685+
// TestJobSpecChangeset (0s) // Nope
686+
// Test_ActiveCandidate (1m1s) // Nope
687+
if timeout {
688+
if timedOutTestNameRe.MatchString(output) {
689+
matchTimedOutTestName := timedOutTestNameRe.FindStringSubmatch(output)
690+
if len(matchTimedOutTestName) > 1 {
691+
testName := strings.TrimSpace(matchTimedOutTestName[1])
692+
testDurationStr := strings.TrimSpace(matchTimedOutTestName[2])
693+
testDuration, err := time.ParseDuration(testDurationStr)
694+
if err != nil {
695+
return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrDetectedTimeoutFailedParse, output)
696+
}
697+
if testDuration >= timeoutDuration {
698+
return testName, true, nil
699+
}
700+
}
701+
}
702+
} else {
703+
matchNestedTestName := nestedTestNameRe.FindStringSubmatch(output)
704+
if len(matchNestedTestName) > 1 {
705+
return strings.TrimSpace(matchNestedTestName[1]), false, nil
706+
}
632707
}
633708
}
634-
return "", false, fmt.Errorf("failed to attribute panic to test, using regex '%s' and '%s' on these strings:\n\n%s",
635-
nestedTestNameRe.String(), timedOutTestNamRe.String(), strings.Join(outputs, ""))
709+
// If we reach here, we couldn't attribute the panic to a test in the loop
710+
711+
if timeout {
712+
return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrDetectedTimeoutFailedAttribution, strings.Join(outputs, ""))
713+
}
714+
715+
return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToAttributePanicToTest, strings.Join(outputs, ""))
636716
}
637717

638718
// attributeRaceToTest properly attributes races to the test that caused them.

tools/flakeguard/runner/runner_test.go

Lines changed: 151 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -443,13 +443,54 @@ func TestAttributePanicToTest(t *testing.T) {
443443
},
444444
},
445445
{
446-
name: "timeout panic",
446+
name: "log after test complete panic",
447+
expectedTestName: "Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing",
448+
expectedTimeout: false,
449+
outputs: []string{
450+
"panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: 2025-03-28T17:18:16.703Z\tDEBUG\tCapabilitiesRegistry\tcapabilities/registry.go:69\tget capability\t{\"version\": \"unset@unset\", \"id\": \"[email protected]\"}",
451+
"goroutine 646 [running]:",
452+
"testing.(*common).logDepth(0xc000728000, {0xc0001b9400, 0x9b}, 0x3)",
453+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1064 +0x69f",
454+
"testing.(*common).log(...)",
455+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1046",
456+
"testing.(*common).Logf(0xc000728000, {0x6000752, 0x2}, {0xc001070430, 0x1, 0x1})",
457+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1097 +0x9f",
458+
"go.uber.org/zap/zaptest.TestingWriter.Write({{0x7fb811aa2818?, 0xc000728000?}, 0x20?}, {0xc001074000, 0x9c, 0x400})",
459+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zaptest/logger.go:146 +0x11d",
460+
"go.uber.org/zap/zapcore.(*ioCore).Write(0xc000bff1d0, {0xff, {0xc1f1d45629e99436, 0x252667087c, 0x87b3fa0}, {0x602a730, 0x14}, {0x601d42f, 0xe}, {0x1, ...}, ...}, ...)",
461+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zapcore/core.go:99 +0x18e",
462+
"go.uber.org/zap/zapcore.(*CheckedEntry).Write(0xc00106dba0, {0xc00101d400, 0x1, 0x2})",
463+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zapcore/entry.go:253 +0x1ed",
464+
"go.uber.org/zap.(*SugaredLogger).log(0xc0001e48b8, 0xff, {0x601d42f, 0xe}, {0x0, 0x0, 0x0}, {0xc00101bf40, 0x2, 0x2})",
465+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/sugar.go:355 +0x12d",
466+
"go.uber.org/zap.(*SugaredLogger).Debugw(...)",
467+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/sugar.go:251",
468+
"github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).Get(0xc000ab88c0, {0x20?, 0x87bb320?}, {0xc0013282a0, 0x18})",
469+
"\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:69 +0x1cf",
470+
"github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).GetTrigger(0xc000ab88c0, {0x67d38a8, 0xc0011f22d0}, {0xc0013282a0, 0x18})",
471+
"\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:80 +0x6f",
472+
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).resolveWorkflowCapabilities(0xc000e75188, {0x67d38a8, 0xc0011f22d0})",
473+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:198 +0x173",
474+
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init.func1()",
475+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:348 +0x2aa",
476+
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.retryable({0x67d38a8, 0xc0011f22d0}, {0x680c850, 0xc000e08210}, 0x1388, 0x0, 0xc000f0bf08)",
477+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/retry.go:45 +0x402",
478+
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init(0xc000e75188, {0x67d38a8, 0xc0011f22d0})",
479+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:339 +0x225",
480+
"created by github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).Start.func1 in goroutine 390",
481+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:179 +0xf37",
482+
"FAIL\tgithub.com/smartcontractkit/chainlink/v2/core/services/workflows/syncer\t159.643s",
483+
},
484+
},
485+
{
486+
name: "timeout panic with obvious culprit",
447487
expectedTestName: "TestTimedOut",
448488
expectedTimeout: true,
449489
outputs: []string{
450490
"panic: test timed out after 10m0s",
451491
"running tests",
452-
"TestTimedOut (10m0s)",
492+
"\tTestNoTimeout (9m59s)",
493+
"\tTestTimedOut (10m0s)",
453494
"goroutine 397631 [running]:",
454495
"testing.(*M).startAlarm.func1()",
455496
"\t/opt/hostedtoolcache/go/1.23.3/x64/src/testing/testing.go:2373 +0x385",
@@ -575,11 +616,15 @@ func TestFailToAttributePanicToTest(t *testing.T) {
575616
t.Parallel()
576617

577618
testCases := []struct {
578-
name string
579-
outputs []string
619+
name string
620+
expectedTimeout bool
621+
expectedError error
622+
outputs []string
580623
}{
581624
{
582-
name: "no test name in panic",
625+
name: "no test name in panic",
626+
expectedTimeout: false,
627+
expectedError: ErrFailedToAttributePanicToTest,
583628
outputs: []string{
584629
"panic: reflect: Elem of invalid type bool",
585630
"goroutine 104182 [running]:",
@@ -617,7 +662,105 @@ func TestFailToAttributePanicToTest(t *testing.T) {
617662
},
618663
},
619664
{
620-
name: "possible regex trip-up",
665+
name: "fail to parse timeout duration",
666+
expectedTimeout: true,
667+
expectedError: ErrFailedToParseTimeoutDuration,
668+
outputs: []string{
669+
"panic: test timed out after malformedDurationStr\n",
670+
"\trunning tests:\n",
671+
"\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n",
672+
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
673+
"\t\tTestAlmostPanicTime (9m59s)\n",
674+
"\t\tTestConnectNewChain (1m1s)\n",
675+
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
676+
"\t\tTestJobSpecChangeset (0s)\n",
677+
"\t\tTest_ActiveCandidate (1m1s)\n",
678+
"goroutine 971967 [running]:\n",
679+
"testing.(*M).startAlarm.func1()\n",
680+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
681+
"created by time.goFunc\n",
682+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
683+
"goroutine 1 [chan receive]:\n",
684+
"testing.tRunner.func1()\n",
685+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
686+
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
687+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
688+
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
689+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
690+
"testing.(*M).Run(0xc0014732c0)\n",
691+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
692+
"main.main()\n",
693+
"\t_testmain.go:119 +0x165\n",
694+
},
695+
},
696+
{
697+
name: "fail to parse test duration",
698+
expectedTimeout: true,
699+
expectedError: ErrDetectedTimeoutFailedParse,
700+
outputs: []string{
701+
"panic: test timed out after 10m0s\n",
702+
"\trunning tests:\n",
703+
"\t\tTestAddAndPromoteCandidatesForNewChain (malformedDurationStr)\n",
704+
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
705+
"\t\tTestAlmostPanicTime (9m59s)\n",
706+
"\t\tTestConnectNewChain (1m1s)\n",
707+
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
708+
"\t\tTestJobSpecChangeset (0s)\n",
709+
"\t\tTest_ActiveCandidate (1m1s)\n",
710+
"goroutine 971967 [running]:\n",
711+
"testing.(*M).startAlarm.func1()\n",
712+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
713+
"created by time.goFunc\n",
714+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
715+
"goroutine 1 [chan receive]:\n",
716+
"testing.tRunner.func1()\n",
717+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
718+
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
719+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
720+
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
721+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
722+
"testing.(*M).Run(0xc0014732c0)\n",
723+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
724+
"main.main()\n",
725+
"\t_testmain.go:119 +0x165\n",
726+
},
727+
},
728+
{
729+
name: "timeout panic without obvious culprit",
730+
expectedTimeout: true,
731+
expectedError: ErrDetectedTimeoutFailedAttribution,
732+
outputs: []string{
733+
"panic: test timed out after 10m0s\n",
734+
"\trunning tests:\n",
735+
"\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n",
736+
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
737+
"\t\tTestAlmostPanicTime (9m59s)\n",
738+
"\t\tTestConnectNewChain (1m1s)\n",
739+
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
740+
"\t\tTestJobSpecChangeset (0s)\n",
741+
"\t\tTest_ActiveCandidate (1m1s)\n",
742+
"goroutine 971967 [running]:\n",
743+
"testing.(*M).startAlarm.func1()\n",
744+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
745+
"created by time.goFunc\n",
746+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
747+
"goroutine 1 [chan receive]:\n",
748+
"testing.tRunner.func1()\n",
749+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
750+
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
751+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
752+
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
753+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
754+
"testing.(*M).Run(0xc0014732c0)\n",
755+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
756+
"main.main()\n",
757+
"\t_testmain.go:119 +0x165\n",
758+
},
759+
},
760+
{
761+
name: "possible regex trip-up",
762+
expectedTimeout: false,
763+
expectedError: ErrFailedToAttributePanicToTest,
621764
outputs: []string{
622765
"panic: runtime error: invalid memory address or nil pointer dereference\n",
623766
"[signal SIGSEGV: segmentation violation code=0x1 addr=0x18 pc=0x21589cc]\n",
@@ -759,9 +902,10 @@ func TestFailToAttributePanicToTest(t *testing.T) {
759902
for _, tc := range testCases {
760903
t.Run(tc.name, func(t *testing.T) {
761904
testName, timeout, err := attributePanicToTest(tc.outputs)
905+
assert.Equal(t, tc.expectedTimeout, timeout, "timeout flag mismatch")
762906
require.Error(t, err)
907+
assert.ErrorIs(t, err, tc.expectedError, "error mismatch")
763908
assert.Empty(t, testName, "test name should be empty")
764-
assert.False(t, timeout, "timeout flag should be false")
765909
})
766910
}
767911
}

0 commit comments

Comments
 (0)