Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 99 additions & 19 deletions tools/flakeguard/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,9 @@ func (r *Runner) parseTestResults(runPrefix string, runCount int) ([]reports.Tes
}
panicTest, timeout, err := attributePanicToTest(outputs)
if err != nil {
log.Warn().Err(err).Msg("Unable to attribute panic to a test")
panicTest = "UnableToAttributePanicTestPleaseInvestigate"
log.Error().Msg("Unable to attribute panic to a test")
fmt.Println(err.Error())
panicTest = "UnableToAttributePanicToTestPleaseInvestigate"
}
panicTestKey := fmt.Sprintf("%s/%s", entryLine.Package, panicTest)

Expand Down Expand Up @@ -612,27 +613,106 @@ func (r *Runner) transformTestOutputFiles(filePaths []string) error {
return nil
}

var (
ErrFailedToAttributePanicToTest = errors.New("failed to attribute panic to test")
ErrFailedToParseTimeoutDuration = errors.New("failed to parse timeout duration")
ErrFailedToExtractTimeoutDuration = errors.New("failed to extract timeout duration")
ErrDetectedLogAfterCompleteFailedAttribution = errors.New("detected a log after test has completed panic, but failed to properly attribute it")
ErrDetectedTimeoutFailedParse = errors.New("detected test timeout, but failed to parse the duration from the test")
ErrDetectedTimeoutFailedAttribution = errors.New("detected test timeout, but failed to attribute the timeout to a specific test")
)

// attributePanicToTest properly attributes panics to the test that caused them.
// There are a lot of edge cases and strange behavior in Go test output when it comes to panics.
func attributePanicToTest(outputs []string) (test string, timeout bool, err error) {
// Regex to extract a valid test function name from a panic message, e.g.
// github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00)
nestedTestNameRe := regexp.MustCompile(`\.(Test[A-Z]\w+)(?:\.[^(]+)?\s*\(`)
// Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g.
// TestTimedOut (10m0s)
timedOutTestNamRe := regexp.MustCompile(`^(Test\w+)\W+\(.*\)$`)

for _, o := range outputs {
outputs = append(outputs, o)
matchNestedTestName := nestedTestNameRe.FindStringSubmatch(o)
matchTimedOutTestName := timedOutTestNamRe.FindStringSubmatch(o)
if len(matchNestedTestName) > 1 {
return strings.TrimSpace(matchNestedTestName[1]), false, nil
} else if len(matchTimedOutTestName) > 1 {
return strings.TrimSpace(matchTimedOutTestName[1]), true, nil
var (
// Regex to extract a valid test function name from a panic message.
// This is the most common situation for test panics, e.g.
// github.com/smartcontractkit/chainlink/deployment/keystone/changeset_test.TestDeployBalanceReader(0xc000583c00)
nestedTestNameRe = regexp.MustCompile(`\.(Test[^\s]+?)(?:\.[^(]+)?\s*\(`)

// Regex to check if the panic is from a log after a goroutine, e.g.
// panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: <Log line>
testLogAfterTestRe = regexp.MustCompile(`^panic: Log in goroutine after (Test[^\s]+) has completed:`)

// Check if the panic message indicates a timeout, e.g.
// panic: test timed out after 10m0s
didTestTimeoutRe = regexp.MustCompile(`^panic: test timed out after ([^\s]+)`)
// Regex to extract a valid test function name from a panic message if the panic is a timeout, e.g.
// TestTimedOut (10m0s)
timedOutTestNameRe = regexp.MustCompile(`^(Test[^\s]+)\s+\((.*)\)`)
timeoutDurationStr string
timeoutDuration time.Duration
)

for _, output := range outputs {
output = strings.TrimSpace(output)
// Check if the panic message indicates a timeout
// If so, extract the timeout duration and switch to timeout mode
if didTestTimeoutRe.MatchString(output) {
timeout = true
if len(didTestTimeoutRe.FindStringSubmatch(output)) > 1 {
timeoutDurationStr = didTestTimeoutRe.FindStringSubmatch(output)[1]
timeoutDuration, err = time.ParseDuration(timeoutDurationStr)
if err != nil {
return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrFailedToParseTimeoutDuration, output)
}
} else {
return "", true, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToExtractTimeoutDuration, output)
}
}

if testLogAfterTestRe.MatchString(output) {
// If the panic message indicates a log after a test, extract the test name
match := testLogAfterTestRe.FindStringSubmatch(output)
if len(match) > 1 {
testName := strings.TrimSpace(match[1])
return testName, timeout, nil
} else {
return "", false, fmt.Errorf(
"%w using this output:\n\n%s", ErrDetectedLogAfterCompleteFailedAttribution, strings.Join(outputs, ""),
)
}
}

// If in timeout mode, look for test names in the panic message and check if any match the timeout duration, e.g.
// panic: test timed out after 10m0s
// running tests:
// TestAddAndPromoteCandidatesForNewChain (22s) // Nope
// TestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s) // Nope
// TestTimeout (10m0s) // Yes
// TestConnectNewChain/Use_production_router_(with_MCMS) (1m1s) // Nope
// TestJobSpecChangeset (0s) // Nope
// Test_ActiveCandidate (1m1s) // Nope
if timeout {
if timedOutTestNameRe.MatchString(output) {
matchTimedOutTestName := timedOutTestNameRe.FindStringSubmatch(output)
if len(matchTimedOutTestName) > 1 {
testName := strings.TrimSpace(matchTimedOutTestName[1])
testDurationStr := strings.TrimSpace(matchTimedOutTestName[2])
testDuration, err := time.ParseDuration(testDurationStr)
if err != nil {
return "", true, fmt.Errorf("%w: %w using this output:\n\n%s", err, ErrDetectedTimeoutFailedParse, output)
}
if testDuration >= timeoutDuration {
return testName, true, nil
}
}
}
} else {
matchNestedTestName := nestedTestNameRe.FindStringSubmatch(output)
if len(matchNestedTestName) > 1 {
return strings.TrimSpace(matchNestedTestName[1]), false, nil
}
}
}
return "", false, fmt.Errorf("failed to attribute panic to test, using regex '%s' and '%s' on these strings:\n\n%s",
nestedTestNameRe.String(), timedOutTestNamRe.String(), strings.Join(outputs, ""))
// If we reach here, we couldn't attribute the panic to a test in the loop

if timeout {
return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrDetectedTimeoutFailedAttribution, strings.Join(outputs, ""))
}

return "", timeout, fmt.Errorf("%w using this output:\n\n%s", ErrFailedToAttributePanicToTest, strings.Join(outputs, ""))
}

// attributeRaceToTest properly attributes races to the test that caused them.
Expand Down
158 changes: 151 additions & 7 deletions tools/flakeguard/runner/runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,13 +443,54 @@ func TestAttributePanicToTest(t *testing.T) {
},
},
{
name: "timeout panic",
name: "log after test complete panic",
expectedTestName: "Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing",
expectedTimeout: false,
outputs: []string{
"panic: Log in goroutine after Test_workflowRegisteredHandler/skips_fetch_if_secrets_url_is_missing has completed: 2025-03-28T17:18:16.703Z\tDEBUG\tCapabilitiesRegistry\tcapabilities/registry.go:69\tget capability\t{\"version\": \"unset@unset\", \"id\": \"[email protected]\"}",
"goroutine 646 [running]:",
"testing.(*common).logDepth(0xc000728000, {0xc0001b9400, 0x9b}, 0x3)",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1064 +0x69f",
"testing.(*common).log(...)",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1046",
"testing.(*common).Logf(0xc000728000, {0x6000752, 0x2}, {0xc001070430, 0x1, 0x1})",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1097 +0x9f",
"go.uber.org/zap/zaptest.TestingWriter.Write({{0x7fb811aa2818?, 0xc000728000?}, 0x20?}, {0xc001074000, 0x9c, 0x400})",
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zaptest/logger.go:146 +0x11d",
"go.uber.org/zap/zapcore.(*ioCore).Write(0xc000bff1d0, {0xff, {0xc1f1d45629e99436, 0x252667087c, 0x87b3fa0}, {0x602a730, 0x14}, {0x601d42f, 0xe}, {0x1, ...}, ...}, ...)",
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zapcore/core.go:99 +0x18e",
"go.uber.org/zap/zapcore.(*CheckedEntry).Write(0xc00106dba0, {0xc00101d400, 0x1, 0x2})",
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/zapcore/entry.go:253 +0x1ed",
"go.uber.org/zap.(*SugaredLogger).log(0xc0001e48b8, 0xff, {0x601d42f, 0xe}, {0x0, 0x0, 0x0}, {0xc00101bf40, 0x2, 0x2})",
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/sugar.go:355 +0x12d",
"go.uber.org/zap.(*SugaredLogger).Debugw(...)",
"\t/home/runner/go/pkg/mod/go.uber.org/[email protected]/sugar.go:251",
"github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).Get(0xc000ab88c0, {0x20?, 0x87bb320?}, {0xc0013282a0, 0x18})",
"\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:69 +0x1cf",
"github.com/smartcontractkit/chainlink/v2/core/capabilities.(*Registry).GetTrigger(0xc000ab88c0, {0x67d38a8, 0xc0011f22d0}, {0xc0013282a0, 0x18})",
"\t/home/runner/work/chainlink/chainlink/core/capabilities/registry.go:80 +0x6f",
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).resolveWorkflowCapabilities(0xc000e75188, {0x67d38a8, 0xc0011f22d0})",
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:198 +0x173",
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init.func1()",
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:348 +0x2aa",
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.retryable({0x67d38a8, 0xc0011f22d0}, {0x680c850, 0xc000e08210}, 0x1388, 0x0, 0xc000f0bf08)",
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/retry.go:45 +0x402",
"github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).init(0xc000e75188, {0x67d38a8, 0xc0011f22d0})",
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:339 +0x225",
"created by github.com/smartcontractkit/chainlink/v2/core/services/workflows.(*Engine).Start.func1 in goroutine 390",
"\t/home/runner/work/chainlink/chainlink/core/services/workflows/engine.go:179 +0xf37",
"FAIL\tgithub.com/smartcontractkit/chainlink/v2/core/services/workflows/syncer\t159.643s",
},
},
{
name: "timeout panic with obvious culprit",
expectedTestName: "TestTimedOut",
expectedTimeout: true,
outputs: []string{
"panic: test timed out after 10m0s",
"running tests",
"TestTimedOut (10m0s)",
"\tTestNoTimeout (9m59s)",
"\tTestTimedOut (10m0s)",
"goroutine 397631 [running]:",
"testing.(*M).startAlarm.func1()",
"\t/opt/hostedtoolcache/go/1.23.3/x64/src/testing/testing.go:2373 +0x385",
Expand Down Expand Up @@ -575,11 +616,15 @@ func TestFailToAttributePanicToTest(t *testing.T) {
t.Parallel()

testCases := []struct {
name string
outputs []string
name string
expectedTimeout bool
expectedError error
outputs []string
}{
{
name: "no test name in panic",
name: "no test name in panic",
expectedTimeout: false,
expectedError: ErrFailedToAttributePanicToTest,
outputs: []string{
"panic: reflect: Elem of invalid type bool",
"goroutine 104182 [running]:",
Expand Down Expand Up @@ -617,7 +662,105 @@ func TestFailToAttributePanicToTest(t *testing.T) {
},
},
{
name: "possible regex trip-up",
name: "fail to parse timeout duration",
expectedTimeout: true,
expectedError: ErrFailedToParseTimeoutDuration,
outputs: []string{
"panic: test timed out after malformedDurationStr\n",
"\trunning tests:\n",
"\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n",
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
"\t\tTestAlmostPanicTime (9m59s)\n",
"\t\tTestConnectNewChain (1m1s)\n",
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
"\t\tTestJobSpecChangeset (0s)\n",
"\t\tTest_ActiveCandidate (1m1s)\n",
"goroutine 971967 [running]:\n",
"testing.(*M).startAlarm.func1()\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
"created by time.goFunc\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
"goroutine 1 [chan receive]:\n",
"testing.tRunner.func1()\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
"testing.(*M).Run(0xc0014732c0)\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
"main.main()\n",
"\t_testmain.go:119 +0x165\n",
},
},
{
name: "fail to parse test duration",
expectedTimeout: true,
expectedError: ErrDetectedTimeoutFailedParse,
outputs: []string{
"panic: test timed out after 10m0s\n",
"\trunning tests:\n",
"\t\tTestAddAndPromoteCandidatesForNewChain (malformedDurationStr)\n",
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
"\t\tTestAlmostPanicTime (9m59s)\n",
"\t\tTestConnectNewChain (1m1s)\n",
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
"\t\tTestJobSpecChangeset (0s)\n",
"\t\tTest_ActiveCandidate (1m1s)\n",
"goroutine 971967 [running]:\n",
"testing.(*M).startAlarm.func1()\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
"created by time.goFunc\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
"goroutine 1 [chan receive]:\n",
"testing.tRunner.func1()\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
"testing.(*M).Run(0xc0014732c0)\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
"main.main()\n",
"\t_testmain.go:119 +0x165\n",
},
},
{
name: "timeout panic without obvious culprit",
expectedTimeout: true,
expectedError: ErrDetectedTimeoutFailedAttribution,
outputs: []string{
"panic: test timed out after 10m0s\n",
"\trunning tests:\n",
"\t\tTestAddAndPromoteCandidatesForNewChain (22s)\n",
"\t\tTestAddAndPromoteCandidatesForNewChain/Remote_chains_owned_by_MCMS (22s)\n",
"\t\tTestAlmostPanicTime (9m59s)\n",
"\t\tTestConnectNewChain (1m1s)\n",
"\t\tTestConnectNewChain/Use_production_router_(with_MCMS) (1m1s)\n",
"\t\tTestJobSpecChangeset (0s)\n",
"\t\tTest_ActiveCandidate (1m1s)\n",
"goroutine 971967 [running]:\n",
"testing.(*M).startAlarm.func1()\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2484 +0x605\n",
"created by time.goFunc\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/time/sleep.go:215 +0x45\n",
"goroutine 1 [chan receive]:\n",
"testing.tRunner.func1()\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1753 +0x965\n",
"testing.tRunner(0xc0013dac40, 0xc0025b7ae0)\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:1798 +0x25f\n",
"testing.runTests(0xc0010a0b70, {0x14366840, 0x25, 0x25}, {0x3?, 0x0?, 0x146214a0?})\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2277 +0x96d\n",
"testing.(*M).Run(0xc0014732c0)\n",
"\t/opt/hostedtoolcache/go/1.24.0/x64/src/testing/testing.go:2142 +0xeeb\n",
"main.main()\n",
"\t_testmain.go:119 +0x165\n",
},
},
{
name: "possible regex trip-up",
expectedTimeout: false,
expectedError: ErrFailedToAttributePanicToTest,
outputs: []string{
"panic: runtime error: invalid memory address or nil pointer dereference\n",
"[signal SIGSEGV: segmentation violation code=0x1 addr=0x18 pc=0x21589cc]\n",
Expand Down Expand Up @@ -759,9 +902,10 @@ func TestFailToAttributePanicToTest(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
testName, timeout, err := attributePanicToTest(tc.outputs)
assert.Equal(t, tc.expectedTimeout, timeout, "timeout flag mismatch")
require.Error(t, err)
assert.ErrorIs(t, err, tc.expectedError, "error mismatch")
assert.Empty(t, testName, "test name should be empty")
assert.False(t, timeout, "timeout flag should be false")
})
}
}
Expand Down
Loading