From 08fde368c2e8db3760222419d4ff3adee416eba3 Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Fri, 28 Mar 2025 17:11:25 -0400
Subject: [PATCH] Accounts for more panic scenarios

---
 tools/flakeguard/runner/runner.go      | 118 +++++++++++++++---
 tools/flakeguard/runner/runner_test.go | 158 +++++++++++++++++++++++--
 2 files changed, 250 insertions(+), 26 deletions(-)
diff --git a/tools/flakeguard/runner/runner.go b/tools/flakeguard/runner/runner.go
index c07630f93..48b35ac0d 100644
--- a/tools/flakeguard/runner/runner.go
+++ b/tools/flakeguard/runner/runner.go
@@ -383,8 +383,9 @@ func (r *Runner) parseTestResults(runPrefix string, runCount int) ([]reports.Tes
 					}
 					panicTest, timeout, err := attributePanicToTest(outputs)
 					if err != nil {
-						log.Warn().Err(err).Msg("Unable to attribute panic to a test")
-						panicTest = "UnableToAttributePanicTestPleaseInvestigate"
+						log.Error().Msg("Unable to attribute panic to a test")
+						fmt.Println(err.Error())
+						panicTest = "UnableToAttributePanicToTestPleaseInvestigate"
 					}
 					panicTestKey := fmt.Sprintf("%s/%s", entryLine.Package, panicTest)
 
@@ -612,27 +613,106 @@ func (r *Runner) transformTestOutputFiles(filePaths []string) error {
 	return nil
 }
 
+var (
+	ErrFailedToAttributePanicToTest              = errors.New("failed to attribute panic to test")
+	ErrFailedToParseTimeoutDuration              = errors.New("failed to parse timeout duration")
+	ErrFailedToExtractTimeoutDuration            = errors.New("failed to extract timeout duration")
+	ErrDetectedLogAfterCompleteFailedAttribution = errors.New("detected a log after test has completed panic, but failed to properly attribute it")
+	ErrDetectedTimeoutFailedParse                = errors.New("detected test timeout, but failed to parse the duration from the test")
+	ErrDetectedTimeoutFailedAttribution          = errors.New("detected test timeout, but failed to attribute the timeout to a specific test")
+)
+
 // attributePanicToTest properly attributes panics to the test that caused them.
+// There are a lot of edge cases and strange behavior in Go test output when it comes to panics.
 func attributePanicToTest(outputs []string) (test string, timeout bool, err error) {
-	// Regex to extract a valid test function name from a panic message, e.g.
-	// github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00)
-	nestedTestNameRe := regexp.MustCompile(`\.(Test[A-Z]\w+)(?:\.[^(]+)?\s*\(`)
-	// Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g.
-	// TestTimedOut (10m0s)
-	timedOutTestNamRe := regexp.MustCompile(`^(Test\w+)\W+\(.*\)$`)
-
-	for _, o := range outputs {
-		outputs = append(outputs, o)
-		matchNestedTestName := nestedTestNameRe.FindStringSubmatch(o)
-		matchTimedOutTestName := timedOutTestNamRe.FindStringSubmatch(o)
-		if len(matchNestedTestName) > 1 {
-			return strings.TrimSpace(matchNestedTestName[1]), false, nil
-		} else if len(matchTimedOutTestName) > 1 {
-			return strings.TrimSpace(matchTimedOutTestName[1]), true, nil
+	var (
+		// Regex to extract a valid test function name from a panic message.
+		// This is the most common situation for test panics, e.g.
+		// github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00)
+		nestedTestNameRe = regexp.MustCompile(`\.(Test[^\s]+?)(?:\.[^(]+)?\s*\(`)
+
+		// Regex to check if the panic is from a log after a goroutine, e.g.
+		// panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: <Log line>
+		testLogAfterTestRe = regexp.MustCompile(`^panic: Log in goroutine after (Test[^\s]+) has completed:`)
+
+		// Check if the panic message indicates a timeout, e.g.
+		// panic: test timed out after 10m0s
+		didTestTimeoutRe = regexp.MustCompile(`^panic: test timed out after ([^\s]+)`)
+		// Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g.
+		// TestTimedOut (10m0s)
+		timedOutTestNameRe = regexp.MustCompile(`^(Test[^\s]+)\s+\((.*)\)`)
+		timeoutDurationStr string
+		timeoutDuration    time.Duration
+	)
+
+	for _, output := range outputs {
+		output = strings.TrimSpace(output)
+		// Check if the panic message indicates a timeout
+		// If so, extract the timeout duration and switch to timeout mode
+		if didTestTimeoutRe.MatchString(output) {
+			timeout = true
+			if len(didTestTimeoutRe.FindStringSubmatch(output)) > 1 {
+				timeoutDurationStr = didTestTimeoutRe.FindStringSubmatch(output)[1]
+				timeoutDuration, err = time.ParseDuration(timeoutDurationStr)
+				if err != nil {
+					return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrFailedToParseTimeoutDuration, output)
+				}
+			} else {
+				return "", true, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToExtractTimeoutDuration, output)
+			}
+		}
+
+		if testLogAfterTestRe.MatchString(output) {
+			// If the panic message indicates a log after a test, extract the test name
+			match := testLogAfterTestRe.FindStringSubmatch(output)
+			if len(match) > 1 {
+				testName := strings.TrimSpace(match[1])
+				return testName, timeout, nil
+			} else {
+				return "", false, fmt.Errorf(
+					"%w using this output:\n\n%s", ErrDetectedLogAfterCompleteFailedAttribution, strings.Join(outputs, ""),
+				)
+			}
+		}
+
+		// If in timeout mode, look for test names in the panic message and check if any match the timeout duration, e.g.
+		// 	panic: test timed out after 10m0s
+		//   running tests:
+		//     TestAddAndPromoteCandidatesForNewChain (22s) // Nope
+		//     TestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s) // Nope
+		//     TestTimeout (10m0s) // Yes
+		//     TestConnectNewChain/Use_production_router_(with_MCMS) (1m1s) // Nope
+		//     TestJobSpecChangeset (0s) // Nope
+		//     Test_ActiveCandidate (1m1s) // Nope
+		if timeout {
+			if timedOutTestNameRe.MatchString(output) {
+				matchTimedOutTestName := timedOutTestNameRe.FindStringSubmatch(output)
+				if len(matchTimedOutTestName) > 1 {
+					testName := strings.TrimSpace(matchTimedOutTestName[1])
+					testDurationStr := strings.TrimSpace(matchTimedOutTestName[2])
+					testDuration, err := time.ParseDuration(testDurationStr)
+					if err != nil {
+						return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrDetectedTimeoutFailedParse, output)
+					}
+					if testDuration >= timeoutDuration {
+						return testName, true, nil
+					}
+				}
+			}
+		} else {
+			matchNestedTestName := nestedTestNameRe.FindStringSubmatch(output)
+			if len(matchNestedTestName) > 1 {
+				return strings.TrimSpace(matchNestedTestName[1]), false, nil
+			}
 		}
 	}
-	return "", false, fmt.Errorf("failed to attribute panic to test, using regex '%s' and '%s' on these strings:\n\n%s",
-		nestedTestNameRe.String(), timedOutTestNamRe.String(), strings.Join(outputs, ""))
+	// If we reach here, we couldn't attribute the panic to a test in the loop
+
+	if timeout {
+		return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrDetectedTimeoutFailedAttribution, strings.Join(outputs, ""))
+	}
+
+	return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToAttributePanicToTest, strings.Join(outputs, ""))
 }
 
 // attributeRaceToTest properly attributes races to the test that caused them.
diff --git a/tools/flakeguard/runner/runner_test.go b/tools/flakeguard/runner/runner_test.go
index 7918ac0ec..f8d6259e4 100644
--- a/tools/flakeguard/runner/runner_test.go
+++ b/tools/flakeguard/runner/runner_test.go
@@ -443,13 +443,54 @@ func TestAttributePanicToTest(t *testing.T) {
 			},
 		},
 		{
-			name:             "timeout panic",
+			name:             "log after test complete panic",
+			expectedTestName: "Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing",
+			expectedTimeout:  false,
+			outputs: []string{
+				"panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: 2025-03-28T17:18:16.703Z\tDEBUG\tCapabilitiesRegistry\tcapabilities/registry.go:69\tget capability\t{\"version\": \"unset@unset\", \"id\": \"basic-test-trigger@1.0.0\"}",
+				"goroutine 646 [running]:",
+				"testing.(*common).logDepth(0xc000728000, {0xc0001b9400, 0x9b}, 0x3)",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1064 +0x69f",
+				"testing.(*common).log(...)",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1046",
+				"testing.(*common).Logf(0xc000728000, {0x6000752, 0x2}, {0xc001070430, 0x1, 0x1})",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1097 +0x9f",
+				"go.uber.org/zap/zaptest.TestingWriter.Write({{0x7fb811aa2818?, 0xc000728000?}, 0x20?}, {0xc001074000, 0x9c, 0x400})",
+				"\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/zaptest/logger.go:146 +0x11d",
+				"go.uber.org/zap/zapcore.(*ioCore).Write(0xc000bff1d0, {0xff, {0xc1f1d45629e99436, 0x252667087c, 0x87b3fa0}, {0x602a730, 0x14}, {0x601d42f, 0xe}, {0x1, ...}, ...}, ...)",
+				"\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/zapcore/core.go:99 +0x18e",
+				"go.uber.org/zap/zapcore.(*CheckedEntry).Write(0xc00106dba0, {0xc00101d400, 0x1, 0x2})",
+				"\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/zapcore/entry.go:253 +0x1ed",
+				"go.uber.org/zap.(*SugaredLogger).log(0xc0001e48b8, 0xff, {0x601d42f, 0xe}, {0x0, 0x0, 0x0}, {0xc00101bf40, 0x2, 0x2})",
+				"\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/sugar.go:355 +0x12d",
+				"go.uber.org/zap.(*SugaredLogger).Debugw(...)",
+				"\t/home/runner/go/pkg/mod/go.uber.org/zap@v1.27.0/sugar.go:251",
+				"github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).Get(0xc000ab88c0, {0x20?, 0x87bb320?}, {0xc0013282a0, 0x18})",
+				"\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:69 +0x1cf",
+				"github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).GetTrigger(0xc000ab88c0, {0x67d38a8, 0xc0011f22d0}, {0xc0013282a0, 0x18})",
+				"\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:80 +0x6f",
+				"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).resolveWorkflowCapabilities(0xc000e75188, {0x67d38a8, 0xc0011f22d0})",
+				"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:198 +0x173",
+				"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init.func1()",
+				"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:348 +0x2aa",
+				"github.com/smartcontractkit/chainlink/v2/core/services/workflows.retryable({0x67d38a8, 0xc0011f22d0}, {0x680c850, 0xc000e08210}, 0x1388, 0x0, 0xc000f0bf08)",
+				"\t/home/runner/work/chainlink/chainlink/core/services/workflows/retry.go:45 +0x402",
+				"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init(0xc000e75188, {0x67d38a8, 0xc0011f22d0})",
+				"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:339 +0x225",
+				"created by github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).Start.func1 in goroutine 390",
+				"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:179 +0xf37",
+				"FAIL\tgithub.com/smartcontractkit/chainlink/v2/core/services/workflows/syncer\t159.643s",
+			},
+		},
+		{
+			name:             "timeout panic with obvious culprit",
 			expectedTestName: "TestTimedOut",
 			expectedTimeout:  true,
 			outputs: []string{
 				"panic: test timed out after 10m0s",
 				"running tests",
-				"TestTimedOut (10m0s)",
+				"\tTestNoTimeout (9m59s)",
+				"\tTestTimedOut (10m0s)",
 				"goroutine 397631 [running]:",
 				"testing.(*M).startAlarm.func1()",
 				"\t/opt/hostedtoolcache/go/1.23.3/x64/src/testing/testing.go:2373 +0x385",
@@ -575,11 +616,15 @@ func TestFailToAttributePanicToTest(t *testing.T) {
 	t.Parallel()
 
 	testCases := []struct {
-		name    string
-		outputs []string
+		name            string
+		expectedTimeout bool
+		expectedError   error
+		outputs         []string
 	}{
 		{
-			name: "no test name in panic",
+			name:            "no test name in panic",
+			expectedTimeout: false,
+			expectedError:   ErrFailedToAttributePanicToTest,
 			outputs: []string{
 				"panic: reflect: Elem of invalid type bool",
 				"goroutine 104182 [running]:",
@@ -617,7 +662,105 @@ func TestFailToAttributePanicToTest(t *testing.T) {
 			},
 		},
 		{
-			name: "possible regex trip-up",
+			name:            "fail to parse timeout duration",
+			expectedTimeout: true,
+			expectedError:   ErrFailedToParseTimeoutDuration,
+			outputs: []string{
+				"panic: test timed out after malformedDurationStr\n",
+				"\trunning tests:\n",
+				"\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n",
+				"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
+				"\t\tTestAlmostPanicTime (9m59s)\n",
+				"\t\tTestConnectNewChain (1m1s)\n",
+				"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
+				"\t\tTestJobSpecChangeset (0s)\n",
+				"\t\tTest_ActiveCandidate (1m1s)\n",
+				"goroutine 971967 [running]:\n",
+				"testing.(*M).startAlarm.func1()\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
+				"created by time.goFunc\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
+				"goroutine 1 [chan receive]:\n",
+				"testing.tRunner.func1()\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
+				"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
+				"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
+				"testing.(*M).Run(0xc0014732c0)\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
+				"main.main()\n",
+				"\t_testmain.go:119 +0x165\n",
+			},
+		},
+		{
+			name:            "fail to parse test duration",
+			expectedTimeout: true,
+			expectedError:   ErrDetectedTimeoutFailedParse,
+			outputs: []string{
+				"panic: test timed out after 10m0s\n",
+				"\trunning tests:\n",
+				"\t\tTestAddAndPromoteCandidatesForNewChain (malformedDurationStr)\n",
+				"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
+				"\t\tTestAlmostPanicTime (9m59s)\n",
+				"\t\tTestConnectNewChain (1m1s)\n",
+				"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
+				"\t\tTestJobSpecChangeset (0s)\n",
+				"\t\tTest_ActiveCandidate (1m1s)\n",
+				"goroutine 971967 [running]:\n",
+				"testing.(*M).startAlarm.func1()\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
+				"created by time.goFunc\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
+				"goroutine 1 [chan receive]:\n",
+				"testing.tRunner.func1()\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
+				"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
+				"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
+				"testing.(*M).Run(0xc0014732c0)\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
+				"main.main()\n",
+				"\t_testmain.go:119 +0x165\n",
+			},
+		},
+		{
+			name:            "timeout panic without obvious culprit",
+			expectedTimeout: true,
+			expectedError:   ErrDetectedTimeoutFailedAttribution,
+			outputs: []string{
+				"panic: test timed out after 10m0s\n",
+				"\trunning tests:\n",
+				"\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n",
+				"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
+				"\t\tTestAlmostPanicTime (9m59s)\n",
+				"\t\tTestConnectNewChain (1m1s)\n",
+				"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
+				"\t\tTestJobSpecChangeset (0s)\n",
+				"\t\tTest_ActiveCandidate (1m1s)\n",
+				"goroutine 971967 [running]:\n",
+				"testing.(*M).startAlarm.func1()\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
+				"created by time.goFunc\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
+				"goroutine 1 [chan receive]:\n",
+				"testing.tRunner.func1()\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
+				"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
+				"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
+				"testing.(*M).Run(0xc0014732c0)\n",
+				"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
+				"main.main()\n",
+				"\t_testmain.go:119 +0x165\n",
+			},
+		},
+		{
+			name:            "possible regex trip-up",
+			expectedTimeout: false,
+			expectedError:   ErrFailedToAttributePanicToTest,
 			outputs: []string{
 				"panic: runtime error: invalid memory address or nil pointer dereference\n",
 				"[signal SIGSEGV: segmentation violation code=0x1 addr=0x18 pc=0x21589cc]\n",
@@ -759,9 +902,10 @@ func TestFailToAttributePanicToTest(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			testName, timeout, err := attributePanicToTest(tc.outputs)
+			assert.Equal(t, tc.expectedTimeout, timeout, "timeout flag mismatch")
 			require.Error(t, err)
+			assert.ErrorIs(t, err, tc.expectedError, "error mismatch")
 			assert.Empty(t, testName, "test name should be empty")
-			assert.False(t, timeout, "timeout flag should be false")
 		})
 	}
 }