Skip to content

Commit f023f71

Browse files
committed
Merge with better panic detection
2 parents 35b3c4a + 7db1c58 commit f023f71

File tree

2 files changed

+249
-29
lines changed

2 files changed

+249
-29
lines changed

tools/flakeguard/runner/runner.go

Lines changed: 98 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -384,8 +384,9 @@ func (r *Runner) parseTestResults(runPrefix string, runCount int) ([]reports.Tes
384384
if panicDetectionMode {
385385
panicTest, timeout, err := attributePanicToTest(outputs)
386386
if err != nil {
387-
log.Warn().Err(err).Msg("Unable to attribute panic to a test")
388-
panicTest = "UnableToAttributePanicTestPleaseInvestigate"
387+
log.Error().Msg("Unable to attribute panic to a test")
388+
fmt.Println(err.Error())
389+
panicTest = "UnableToAttributePanicToTestPleaseInvestigate"
389390
}
390391
panicTestKey := fmt.Sprintf("%s/%s", entryLine.Package, panicTest)
391392

@@ -421,7 +422,8 @@ func (r *Runner) parseTestResults(runPrefix string, runCount int) ([]reports.Tes
421422
} else if raceDetectionMode {
422423
raceTest, err := attributeRaceToTest(outputs)
423424
if err != nil {
424-
log.Warn().Err(err).Msg("Unable to attribute race to a test")
425+
log.Warn().Msg("Unable to attribute race to a test")
426+
fmt.Println(err.Error())
425427
raceTest = "UnableToAttributeRaceTestPleaseInvestigate"
426428
}
427429
raceTestKey := fmt.Sprintf("%s/%s", entryLine.Package, raceTest)
@@ -615,32 +617,106 @@ func (r *Runner) transformTestOutputFiles(filePaths []string) error {
615617
}
616618

617619
var (
618-
// Regex to extract a valid test function name from a panic or race message, e.g.
620+
// Regex to extract a valid test function name from a panic message.
621+
// This is the most common situation for test panics, e.g.
619622
// github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00)
620-
nestedTestNameRe = regexp.MustCompile(`\.(Test[A-Z]\w+)(?:\.[^(]+)?\s*\(`)
621-
622-
ErrFailedToAttributePanic = errors.New("failed to attribute panic to test")
623-
ErrFailedToAttributeRace = errors.New("failed to attribute race to test")
623+
nestedTestNameRe = regexp.MustCompile(`\.(Test[^\s]+?)(?:\.[^(]+)?\s*\(`)
624+
625+
ErrFailedToAttributePanicToTest = errors.New("failed to attribute panic to test")
626+
ErrFailedToAttributeRaceToTest = errors.New("failed to attribute race to test")
627+
ErrFailedToParseTimeoutDuration = errors.New("failed to parse timeout duration")
628+
ErrFailedToExtractTimeoutDuration = errors.New("failed to extract timeout duration")
629+
ErrDetectedLogAfterCompleteFailedAttribution = errors.New("detected a log after test has completed panic, but failed to properly attribute it")
630+
ErrDetectedTimeoutFailedParse = errors.New("detected test timeout, but failed to parse the duration from the test")
631+
ErrDetectedTimeoutFailedAttribution = errors.New("detected test timeout, but failed to attribute the timeout to a specific test")
624632
)
625633

626634
// attributePanicToTest properly attributes panics to the test that caused them.
635+
// There are a lot of edge cases and strange behavior in Go test output when it comes to panics.
627636
func attributePanicToTest(outputs []string) (test string, timeout bool, err error) {
628-
// Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g.
629-
// TestTimedOut (10m0s)
630-
timedOutTestNamRe := regexp.MustCompile(`^(Test\w+)\W+\(.*\)$`)
637+
var (
638+
// Regex to check if the panic is from a log after a goroutine, e.g.
639+
// panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: <Log line>
640+
testLogAfterTestRe = regexp.MustCompile(`^panic: Log in goroutine after (Test[^\s]+) has completed:`)
641+
642+
// Check if the panic message indicates a timeout, e.g.
643+
// panic: test timed out after 10m0s
644+
didTestTimeoutRe = regexp.MustCompile(`^panic: test timed out after ([^\s]+)`)
645+
// Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g.
646+
// TestTimedOut (10m0s)
647+
timedOutTestNameRe = regexp.MustCompile(`^(Test[^\s]+)\s+\((.*)\)`)
648+
timeoutDurationStr string
649+
timeoutDuration time.Duration
650+
)
631651

632652
for _, output := range outputs {
633-
matchNestedTestName := nestedTestNameRe.FindStringSubmatch(output)
634-
matchTimedOutTestName := timedOutTestNamRe.FindStringSubmatch(output)
635-
if len(matchNestedTestName) > 1 {
636-
return strings.TrimSpace(matchNestedTestName[1]), false, nil
637-
} else if len(matchTimedOutTestName) > 1 {
638-
return strings.TrimSpace(matchTimedOutTestName[1]), true, nil
653+
output = strings.TrimSpace(output)
654+
// Check if the panic message indicates a timeout
655+
// If so, extract the timeout duration and switch to timeout mode
656+
if didTestTimeoutRe.MatchString(output) {
657+
timeout = true
658+
if len(didTestTimeoutRe.FindStringSubmatch(output)) > 1 {
659+
timeoutDurationStr = didTestTimeoutRe.FindStringSubmatch(output)[1]
660+
timeoutDuration, err = time.ParseDuration(timeoutDurationStr)
661+
if err != nil {
662+
return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrFailedToParseTimeoutDuration, output)
663+
}
664+
} else {
665+
return "", true, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToExtractTimeoutDuration, output)
666+
}
667+
}
668+
669+
if testLogAfterTestRe.MatchString(output) {
670+
// If the panic message indicates a log after a test, extract the test name
671+
match := testLogAfterTestRe.FindStringSubmatch(output)
672+
if len(match) > 1 {
673+
testName := strings.TrimSpace(match[1])
674+
return testName, timeout, nil
675+
} else {
676+
return "", false, fmt.Errorf(
677+
"%w using this output:\n\n%s", ErrDetectedLogAfterCompleteFailedAttribution, strings.Join(outputs, ""),
678+
)
679+
}
680+
}
681+
682+
// If in timeout mode, look for test names in the panic message and check if any match the timeout duration, e.g.
683+
// panic: test timed out after 10m0s
684+
// running tests:
685+
// TestAddAndPromoteCandidatesForNewChain (22s) // Nope
686+
// TestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s) // Nope
687+
// TestTimeout (10m0s) // Yes
688+
// TestConnectNewChain/Use_production_router_(with_MCMS) (1m1s) // Nope
689+
// TestJobSpecChangeset (0s) // Nope
690+
// Test_ActiveCandidate (1m1s) // Nope
691+
if timeout {
692+
if timedOutTestNameRe.MatchString(output) {
693+
matchTimedOutTestName := timedOutTestNameRe.FindStringSubmatch(output)
694+
if len(matchTimedOutTestName) > 1 {
695+
testName := strings.TrimSpace(matchTimedOutTestName[1])
696+
testDurationStr := strings.TrimSpace(matchTimedOutTestName[2])
697+
testDuration, err := time.ParseDuration(testDurationStr)
698+
if err != nil {
699+
return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrDetectedTimeoutFailedParse, output)
700+
}
701+
if testDuration >= timeoutDuration {
702+
return testName, true, nil
703+
}
704+
}
705+
}
706+
} else {
707+
matchNestedTestName := nestedTestNameRe.FindStringSubmatch(output)
708+
if len(matchNestedTestName) > 1 {
709+
return strings.TrimSpace(matchNestedTestName[1]), false, nil
710+
}
639711
}
640712
}
641-
return "", false, fmt.Errorf("%w, using regex '%s' and '%s' on these strings:\n\n%s",
642-
ErrFailedToAttributePanic, nestedTestNameRe.String(), timedOutTestNamRe.String(), strings.Join(outputs, ""),
643-
)
713+
// If we reach here, we couldn't attribute the panic to a test in the loop
714+
715+
if timeout {
716+
return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrDetectedTimeoutFailedAttribution, strings.Join(outputs, ""))
717+
}
718+
719+
return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToAttributePanicToTest, strings.Join(outputs, ""))
644720
}
645721

646722
// attributeRaceToTest properly attributes races to the test that caused them.
@@ -651,8 +727,8 @@ func attributeRaceToTest(outputs []string) (string, error) {
651727
return strings.TrimSpace(match[1]), nil
652728
}
653729
}
654-
return "", fmt.Errorf("%w, using regex '%s' on these strings:\n\n%s",
655-
ErrFailedToAttributeRace, nestedTestNameRe.String(), strings.Join(outputs, ""),
730+
return "", fmt.Errorf("%w, using this output:\n\n%s",
731+
ErrFailedToAttributeRaceToTest, strings.Join(outputs, ""),
656732
)
657733
}
658734

tools/flakeguard/runner/runner_test.go

Lines changed: 151 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -443,13 +443,54 @@ func TestAttributePanicToTest(t *testing.T) {
443443
},
444444
},
445445
{
446-
name: "timeout panic",
446+
name: "log after test complete panic",
447+
expectedTestName: "Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing",
448+
expectedTimeout: false,
449+
outputs: []string{
450+
"panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: 2025-03-28T17:18:16.703Z\tDEBUG\tCapabilitiesRegistry\tcapabilities/registry.go:69\tget capability\t{\"version\": \"unset@unset\", \"id\": \"[email protected]\"}",
451+
"goroutine 646 [running]:",
452+
"testing.(*common).logDepth(0xc000728000, {0xc0001b9400, 0x9b}, 0x3)",
453+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1064 +0x69f",
454+
"testing.(*common).log(...)",
455+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1046",
456+
"testing.(*common).Logf(0xc000728000, {0x6000752, 0x2}, {0xc001070430, 0x1, 0x1})",
457+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1097 +0x9f",
458+
"go.uber.org/zap/zaptest.TestingWriter.Write({{0x7fb811aa2818?, 0xc000728000?}, 0x20?}, {0xc001074000, 0x9c, 0x400})",
459+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zaptest/logger.go:146 +0x11d",
460+
"go.uber.org/zap/zapcore.(*ioCore).Write(0xc000bff1d0, {0xff, {0xc1f1d45629e99436, 0x252667087c, 0x87b3fa0}, {0x602a730, 0x14}, {0x601d42f, 0xe}, {0x1, ...}, ...}, ...)",
461+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zapcore/core.go:99 +0x18e",
462+
"go.uber.org/zap/zapcore.(*CheckedEntry).Write(0xc00106dba0, {0xc00101d400, 0x1, 0x2})",
463+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zapcore/entry.go:253 +0x1ed",
464+
"go.uber.org/zap.(*SugaredLogger).log(0xc0001e48b8, 0xff, {0x601d42f, 0xe}, {0x0, 0x0, 0x0}, {0xc00101bf40, 0x2, 0x2})",
465+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/sugar.go:355 +0x12d",
466+
"go.uber.org/zap.(*SugaredLogger).Debugw(...)",
467+
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/sugar.go:251",
468+
"github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).Get(0xc000ab88c0, {0x20?, 0x87bb320?}, {0xc0013282a0, 0x18})",
469+
"\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:69 +0x1cf",
470+
"github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).GetTrigger(0xc000ab88c0, {0x67d38a8, 0xc0011f22d0}, {0xc0013282a0, 0x18})",
471+
"\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:80 +0x6f",
472+
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).resolveWorkflowCapabilities(0xc000e75188, {0x67d38a8, 0xc0011f22d0})",
473+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:198 +0x173",
474+
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init.func1()",
475+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:348 +0x2aa",
476+
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.retryable({0x67d38a8, 0xc0011f22d0}, {0x680c850, 0xc000e08210}, 0x1388, 0x0, 0xc000f0bf08)",
477+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/retry.go:45 +0x402",
478+
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init(0xc000e75188, {0x67d38a8, 0xc0011f22d0})",
479+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:339 +0x225",
480+
"created by github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).Start.func1 in goroutine 390",
481+
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:179 +0xf37",
482+
"FAIL\tgithub.com/smartcontractkit/chainlink/v2/core/services/workflows/syncer\t159.643s",
483+
},
484+
},
485+
{
486+
name: "timeout panic with obvious culprit",
447487
expectedTestName: "TestTimedOut",
448488
expectedTimeout: true,
449489
outputs: []string{
450490
"panic: test timed out after 10m0s",
451491
"running tests",
452-
"TestTimedOut (10m0s)",
492+
"\tTestNoTimeout (9m59s)",
493+
"\tTestTimedOut (10m0s)",
453494
"goroutine 397631 [running]:",
454495
"testing.(*M).startAlarm.func1()",
455496
"\t/opt/hostedtoolcache/go/1.23.3/x64/src/testing/testing.go:2373 +0x385",
@@ -575,11 +616,15 @@ func TestFailToAttributePanicToTest(t *testing.T) {
575616
t.Parallel()
576617

577618
testCases := []struct {
578-
name string
579-
outputs []string
619+
name string
620+
expectedTimeout bool
621+
expectedError error
622+
outputs []string
580623
}{
581624
{
582-
name: "no test name in panic",
625+
name: "no test name in panic",
626+
expectedTimeout: false,
627+
expectedError: ErrFailedToAttributePanicToTest,
583628
outputs: []string{
584629
"panic: reflect: Elem of invalid type bool",
585630
"goroutine 104182 [running]:",
@@ -617,7 +662,105 @@ func TestFailToAttributePanicToTest(t *testing.T) {
617662
},
618663
},
619664
{
620-
name: "possible regex trip-up",
665+
name: "fail to parse timeout duration",
666+
expectedTimeout: true,
667+
expectedError: ErrFailedToParseTimeoutDuration,
668+
outputs: []string{
669+
"panic: test timed out after malformedDurationStr\n",
670+
"\trunning tests:\n",
671+
"\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n",
672+
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
673+
"\t\tTestAlmostPanicTime (9m59s)\n",
674+
"\t\tTestConnectNewChain (1m1s)\n",
675+
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
676+
"\t\tTestJobSpecChangeset (0s)\n",
677+
"\t\tTest_ActiveCandidate (1m1s)\n",
678+
"goroutine 971967 [running]:\n",
679+
"testing.(*M).startAlarm.func1()\n",
680+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
681+
"created by time.goFunc\n",
682+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
683+
"goroutine 1 [chan receive]:\n",
684+
"testing.tRunner.func1()\n",
685+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
686+
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
687+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
688+
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
689+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
690+
"testing.(*M).Run(0xc0014732c0)\n",
691+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
692+
"main.main()\n",
693+
"\t_testmain.go:119 +0x165\n",
694+
},
695+
},
696+
{
697+
name: "fail to parse test duration",
698+
expectedTimeout: true,
699+
expectedError: ErrDetectedTimeoutFailedParse,
700+
outputs: []string{
701+
"panic: test timed out after 10m0s\n",
702+
"\trunning tests:\n",
703+
"\t\tTestAddAndPromoteCandidatesForNewChain (malformedDurationStr)\n",
704+
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
705+
"\t\tTestAlmostPanicTime (9m59s)\n",
706+
"\t\tTestConnectNewChain (1m1s)\n",
707+
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
708+
"\t\tTestJobSpecChangeset (0s)\n",
709+
"\t\tTest_ActiveCandidate (1m1s)\n",
710+
"goroutine 971967 [running]:\n",
711+
"testing.(*M).startAlarm.func1()\n",
712+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
713+
"created by time.goFunc\n",
714+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
715+
"goroutine 1 [chan receive]:\n",
716+
"testing.tRunner.func1()\n",
717+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
718+
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
719+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
720+
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
721+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
722+
"testing.(*M).Run(0xc0014732c0)\n",
723+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
724+
"main.main()\n",
725+
"\t_testmain.go:119 +0x165\n",
726+
},
727+
},
728+
{
729+
name: "timeout panic without obvious culprit",
730+
expectedTimeout: true,
731+
expectedError: ErrDetectedTimeoutFailedAttribution,
732+
outputs: []string{
733+
"panic: test timed out after 10m0s\n",
734+
"\trunning tests:\n",
735+
"\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n",
736+
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
737+
"\t\tTestAlmostPanicTime (9m59s)\n",
738+
"\t\tTestConnectNewChain (1m1s)\n",
739+
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
740+
"\t\tTestJobSpecChangeset (0s)\n",
741+
"\t\tTest_ActiveCandidate (1m1s)\n",
742+
"goroutine 971967 [running]:\n",
743+
"testing.(*M).startAlarm.func1()\n",
744+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
745+
"created by time.goFunc\n",
746+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
747+
"goroutine 1 [chan receive]:\n",
748+
"testing.tRunner.func1()\n",
749+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
750+
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
751+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
752+
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
753+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
754+
"testing.(*M).Run(0xc0014732c0)\n",
755+
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
756+
"main.main()\n",
757+
"\t_testmain.go:119 +0x165\n",
758+
},
759+
},
760+
{
761+
name: "possible regex trip-up",
762+
expectedTimeout: false,
763+
expectedError: ErrFailedToAttributePanicToTest,
621764
outputs: []string{
622765
"panic: runtime error: invalid memory address or nil pointer dereference\n",
623766
"[signal SIGSEGV: segmentation violation code=0x1 addr=0x18 pc=0x21589cc]\n",
@@ -759,9 +902,10 @@ func TestFailToAttributePanicToTest(t *testing.T) {
759902
for _, tc := range testCases {
760903
t.Run(tc.name, func(t *testing.T) {
761904
testName, timeout, err := attributePanicToTest(tc.outputs)
905+
assert.Equal(t, tc.expectedTimeout, timeout, "timeout flag mismatch")
762906
require.Error(t, err)
907+
assert.ErrorIs(t, err, tc.expectedError, "error mismatch")
763908
assert.Empty(t, testName, "test name should be empty")
764-
assert.False(t, timeout, "timeout flag should be false")
765909
})
766910
}
767911
}

0 commit comments

Comments
 (0)