From 08fde368c2e8db3760222419d4ff3adee416eba3 Mon Sep 17 00:00:00 2001 From: Adam Hamrick Date: Fri, 28 Mar 2025 17:11:25 -0400 Subject: [PATCH] Accounts for more panic scenarios --- tools/flakeguard/runner/runner.go | 118 +++++++++++++++--- tools/flakeguard/runner/runner_test.go | 158 +++++++++++++++++++++++-- 2 files changed, 250 insertions(+), 26 deletions(-) diff --git a/tools/flakeguard/runner/runner.go b/tools/flakeguard/runner/runner.go index c07630f93..48b35ac0d 100644 --- a/tools/flakeguard/runner/runner.go +++ b/tools/flakeguard/runner/runner.go @@ -383,8 +383,9 @@ func (r *Runner) parseTestResults(runPrefix string, runCount int) ([]reports.Tes } panicTest, timeout, err := attributePanicToTest(outputs) if err != nil { - log.Warn().Err(err).Msg("Unable to attribute panic to a test") - panicTest = "UnableToAttributePanicTestPleaseInvestigate" + log.Error().Msg("Unable to attribute panic to a test") + fmt.Println(err.Error()) + panicTest = "UnableToAttributePanicToTestPleaseInvestigate" } panicTestKey := fmt.Sprintf("%s/%s", entryLine.Package, panicTest) @@ -612,27 +613,106 @@ func (r *Runner) transformTestOutputFiles(filePaths []string) error { return nil } +var ( + ErrFailedToAttributePanicToTest = errors.New("failed to attribute panic to test") + ErrFailedToParseTimeoutDuration = errors.New("failed to parse timeout duration") + ErrFailedToExtractTimeoutDuration = errors.New("failed to extract timeout duration") + ErrDetectedLogAfterCompleteFailedAttribution = errors.New("detected a log after test has completed panic, but failed to properly attribute it") + ErrDetectedTimeoutFailedParse = errors.New("detected test timeout, but failed to parse the duration from the test") + ErrDetectedTimeoutFailedAttribution = errors.New("detected test timeout, but failed to attribute the timeout to a specific test") +) + // attributePanicToTest properly attributes panics to the test that caused them. +// There are a lot of edge cases and strange behavior in Go test output when it comes to panics. func attributePanicToTest(outputs []string) (test string, timeout bool, err error) { - // Regex to extract a valid test function name from a panic message, e.g. - // github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00) - nestedTestNameRe := regexp.MustCompile(`\.(Test[A-Z]\w+)(?:\.[^(]+)?\s*\(`) - // Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g. - // TestTimedOut (10m0s) - timedOutTestNamRe := regexp.MustCompile(`^(Test\w+)\W+\(.*\)$`) - - for _, o := range outputs { - outputs = append(outputs, o) - matchNestedTestName := nestedTestNameRe.FindStringSubmatch(o) - matchTimedOutTestName := timedOutTestNamRe.FindStringSubmatch(o) - if len(matchNestedTestName) > 1 { - return strings.TrimSpace(matchNestedTestName[1]), false, nil - } else if len(matchTimedOutTestName) > 1 { - return strings.TrimSpace(matchTimedOutTestName[1]), true, nil + var ( + // Regex to extract a valid test function name from a panic message. + // This is the most common situation for test panics, e.g. + // github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00) + nestedTestNameRe = regexp.MustCompile(`\.(Test[^\s]+?)(?:\.[^(]+)?\s*\(`) + + // Regex to check if the panic is from a log after a goroutine, e.g. + // panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: + testLogAfterTestRe = regexp.MustCompile(`^panic: Log in goroutine after (Test[^\s]+) has completed:`) + + // Check if the panic message indicates a timeout, e.g. + // panic: test timed out after 10m0s + didTestTimeoutRe = regexp.MustCompile(`^panic: test timed out after ([^\s]+)`) + // Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g. + // TestTimedOut (10m0s) + timedOutTestNameRe = regexp.MustCompile(`^(Test[^\s]+)\s+\((.*)\)`) + timeoutDurationStr string + timeoutDuration time.Duration + ) + + for _, output := range outputs { + output = strings.TrimSpace(output) + // Check if the panic message indicates a timeout + // If so, extract the timeout duration and switch to timeout mode + if didTestTimeoutRe.MatchString(output) { + timeout = true + if len(didTestTimeoutRe.FindStringSubmatch(output)) > 1 { + timeoutDurationStr = didTestTimeoutRe.FindStringSubmatch(output)[1] + timeoutDuration, err = time.ParseDuration(timeoutDurationStr) + if err != nil { + return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrFailedToParseTimeoutDuration, output) + } + } else { + return "", true, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToExtractTimeoutDuration, output) + } + } + + if testLogAfterTestRe.MatchString(output) { + // If the panic message indicates a log after a test, extract the test name + match := testLogAfterTestRe.FindStringSubmatch(output) + if len(match) > 1 { + testName := strings.TrimSpace(match[1]) + return testName, timeout, nil + } else { + return "", false, fmt.Errorf( + "%w using this output:\n\n%s", ErrDetectedLogAfterCompleteFailedAttribution, strings.Join(outputs, ""), + ) + } + } + + // If in timeout mode, look for test names in the panic message and check if any match the timeout duration, e.g. + // panic: test timed out after 10m0s + // running tests: + // TestAddAndPromoteCandidatesForNewChain (22s) // Nope + // TestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s) // Nope + // TestTimeout (10m0s) // Yes + // TestConnectNewChain/Use_production_router_(with_MCMS) (1m1s) // Nope + // TestJobSpecChangeset (0s) // Nope + // Test_ActiveCandidate (1m1s) // Nope + if timeout { + if timedOutTestNameRe.MatchString(output) { + matchTimedOutTestName := timedOutTestNameRe.FindStringSubmatch(output) + if len(matchTimedOutTestName) > 1 { + testName := strings.TrimSpace(matchTimedOutTestName[1]) + testDurationStr := strings.TrimSpace(matchTimedOutTestName[2]) + testDuration, err := time.ParseDuration(testDurationStr) + if err != nil { + return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrDetectedTimeoutFailedParse, output) + } + if testDuration >= timeoutDuration { + return testName, true, nil + } + } + } + } else { + matchNestedTestName := nestedTestNameRe.FindStringSubmatch(output) + if len(matchNestedTestName) > 1 { + return strings.TrimSpace(matchNestedTestName[1]), false, nil + } } } - return "", false, fmt.Errorf("failed to attribute panic to test, using regex '%s' and '%s' on these strings:\n\n%s", - nestedTestNameRe.String(), timedOutTestNamRe.String(), strings.Join(outputs, "")) + // If we reach here, we couldn't attribute the panic to a test in the loop + + if timeout { + return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrDetectedTimeoutFailedAttribution, strings.Join(outputs, "")) + } + + return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToAttributePanicToTest, strings.Join(outputs, "")) } // attributeRaceToTest properly attributes races to the test that caused them. diff --git a/tools/flakeguard/runner/runner_test.go b/tools/flakeguard/runner/runner_test.go index 7918ac0ec..f8d6259e4 100644 --- a/tools/flakeguard/runner/runner_test.go +++ b/tools/flakeguard/runner/runner_test.go @@ -443,13 +443,54 @@ func TestAttributePanicToTest(t *testing.T) { }, }, { - name: "timeout panic", + name: "log after test complete panic", + expectedTestName: "Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing", + expectedTimeout: false, + outputs: []string{ + "panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: 2025-03-28T17:18:16.703Z\tDEBUG\tCapabilitiesRegistry\tcapabilities/registry.go:69\tget capability\t{\"version\": \"unset@unset\", \"id\": \"basic-test-trigger@1.0.0\"}", + "goroutine 646 [running]:", + "testing.(*common).logDepth(0xc000728000, {0xc0001b9400, 0x9b}, 0x3)", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1064 +0x69f", + "testing.(*common).log(...)", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1046", + "testing.(*common).Logf(0xc000728000, {0x6000752, 0x2}, {0xc001070430, 0x1, 0x1})", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1097 +0x9f", + "go.uber.org/zap/zaptest.TestingWriter.Write({{0x7fb811aa2818?, 0xc000728000?}, 0x20?}, {0xc001074000, 0x9c, 0x400})", + "\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/zaptest/logger.go:146 +0x11d", + "go.uber.org/zap/zapcore.(*ioCore).Write(0xc000bff1d0, {0xff, {0xc1f1d45629e99436, 0x252667087c, 0x87b3fa0}, {0x602a730, 0x14}, {0x601d42f, 0xe}, {0x1, ...}, ...}, ...)", + "\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/zapcore/core.go:99 +0x18e", + "go.uber.org/zap/zapcore.(*CheckedEntry).Write(0xc00106dba0, {0xc00101d400, 0x1, 0x2})", + "\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/zapcore/entry.go:253 +0x1ed", + "go.uber.org/zap.(*SugaredLogger).log(0xc0001e48b8, 0xff, {0x601d42f, 0xe}, {0x0, 0x0, 0x0}, {0xc00101bf40, 0x2, 0x2})", + "\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/sugar.go:355 +0x12d", + "go.uber.org/zap.(*SugaredLogger).Debugw(...)", + "\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/sugar.go:251", + "github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).Get(0xc000ab88c0, {0x20?, 0x87bb320?}, {0xc0013282a0, 0x18})", + "\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:69 +0x1cf", + "github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).GetTrigger(0xc000ab88c0, {0x67d38a8, 0xc0011f22d0}, {0xc0013282a0, 0x18})", + "\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:80 +0x6f", + "github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).resolveWorkflowCapabilities(0xc000e75188, {0x67d38a8, 0xc0011f22d0})", + "\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:198 +0x173", + "github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init.func1()", + "\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:348 +0x2aa", + "github.com/smartcontractkit/chainlink/v2/core/services/workflows.retryable({0x67d38a8, 0xc0011f22d0}, {0x680c850, 0xc000e08210}, 0x1388, 0x0, 0xc000f0bf08)", + "\t/home/runner/work/chainlink/chainlink/core/services/workflows/retry.go:45 +0x402", + "github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init(0xc000e75188, {0x67d38a8, 0xc0011f22d0})", + "\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:339 +0x225", + "created by github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).Start.func1 in goroutine 390", + "\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:179 +0xf37", + "FAIL\tgithub.com/smartcontractkit/chainlink/v2/core/services/workflows/syncer\t159.643s", + }, + }, + { + name: "timeout panic with obvious culprit", expectedTestName: "TestTimedOut", expectedTimeout: true, outputs: []string{ "panic: test timed out after 10m0s", "running tests", - "TestTimedOut (10m0s)", + "\tTestNoTimeout (9m59s)", + "\tTestTimedOut (10m0s)", "goroutine 397631 [running]:", "testing.(*M).startAlarm.func1()", "\t/opt/hostedtoolcache/go/1.23.3/x64/src/testing/testing.go:2373 +0x385", @@ -575,11 +616,15 @@ func TestFailToAttributePanicToTest(t *testing.T) { t.Parallel() testCases := []struct { - name string - outputs []string + name string + expectedTimeout bool + expectedError error + outputs []string }{ { - name: "no test name in panic", + name: "no test name in panic", + expectedTimeout: false, + expectedError: ErrFailedToAttributePanicToTest, outputs: []string{ "panic: reflect: Elem of invalid type bool", "goroutine 104182 [running]:", @@ -617,7 +662,105 @@ func TestFailToAttributePanicToTest(t *testing.T) { }, }, { - name: "possible regex trip-up", + name: "fail to parse timeout duration", + expectedTimeout: true, + expectedError: ErrFailedToParseTimeoutDuration, + outputs: []string{ + "panic: test timed out after malformedDurationStr\n", + "\trunning tests:\n", + "\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n", + "\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n", + "\t\tTestAlmostPanicTime (9m59s)\n", + "\t\tTestConnectNewChain (1m1s)\n", + "\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n", + "\t\tTestJobSpecChangeset (0s)\n", + "\t\tTest_ActiveCandidate (1m1s)\n", + "goroutine 971967 [running]:\n", + "testing.(*M).startAlarm.func1()\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n", + "created by time.goFunc\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n", + "goroutine 1 [chan receive]:\n", + "testing.tRunner.func1()\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n", + "testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n", + "testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n", + "testing.(*M).Run(0xc0014732c0)\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n", + "main.main()\n", + "\t_testmain.go:119 +0x165\n", + }, + }, + { + name: "fail to parse test duration", + expectedTimeout: true, + expectedError: ErrDetectedTimeoutFailedParse, + outputs: []string{ + "panic: test timed out after 10m0s\n", + "\trunning tests:\n", + "\t\tTestAddAndPromoteCandidatesForNewChain (malformedDurationStr)\n", + "\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n", + "\t\tTestAlmostPanicTime (9m59s)\n", + "\t\tTestConnectNewChain (1m1s)\n", + "\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n", + "\t\tTestJobSpecChangeset (0s)\n", + "\t\tTest_ActiveCandidate (1m1s)\n", + "goroutine 971967 [running]:\n", + "testing.(*M).startAlarm.func1()\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n", + "created by time.goFunc\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n", + "goroutine 1 [chan receive]:\n", + "testing.tRunner.func1()\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n", + "testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n", + "testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n", + "testing.(*M).Run(0xc0014732c0)\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n", + "main.main()\n", + "\t_testmain.go:119 +0x165\n", + }, + }, + { + name: "timeout panic without obvious culprit", + expectedTimeout: true, + expectedError: ErrDetectedTimeoutFailedAttribution, + outputs: []string{ + "panic: test timed out after 10m0s\n", + "\trunning tests:\n", + "\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n", + "\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n", + "\t\tTestAlmostPanicTime (9m59s)\n", + "\t\tTestConnectNewChain (1m1s)\n", + "\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n", + "\t\tTestJobSpecChangeset (0s)\n", + "\t\tTest_ActiveCandidate (1m1s)\n", + "goroutine 971967 [running]:\n", + "testing.(*M).startAlarm.func1()\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n", + "created by time.goFunc\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n", + "goroutine 1 [chan receive]:\n", + "testing.tRunner.func1()\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n", + "testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n", + "testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n", + "testing.(*M).Run(0xc0014732c0)\n", + "\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n", + "main.main()\n", + "\t_testmain.go:119 +0x165\n", + }, + }, + { + name: "possible regex trip-up", + expectedTimeout: false, + expectedError: ErrFailedToAttributePanicToTest, outputs: []string{ "panic: runtime error: invalid memory address or nil pointer dereference\n", "[signal SIGSEGV: segmentation violation code=0x1 addr=0x18 pc=0x21589cc]\n", @@ -759,9 +902,10 @@ func TestFailToAttributePanicToTest(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { testName, timeout, err := attributePanicToTest(tc.outputs) + assert.Equal(t, tc.expectedTimeout, timeout, "timeout flag mismatch") require.Error(t, err) + assert.ErrorIs(t, err, tc.expectedError, "error mismatch") assert.Empty(t, testName, "test name should be empty") - assert.False(t, timeout, "timeout flag should be false") }) } }