Skip to content

Commit e73afad

Browse files
committed
roachtest: extract Fatal-level log messages for easier triage
Currently, when triaging an issue that originates from a node that has failed with a fatal error (regardless if it was being watched by a monitor or not), you get a generic error message that will most likely require you to go to TeamCity and download and inspect the logs. As mentioned in the linked issue, a simple grep on the node's logs can help to identify the issue quickly and there are cases where the roachtest failure can be categorized as an infra related flake (e.g. clock sync). Now after the teardown test step after downloading the artifacts, we have a new inspectArtifacts step where we will search node logs for any fatal logs and if found, we will log to test.log and append those logs to the error message that is sent to the github poster. Then in pkg/cmd/bazci/githubpost/issues/formatter_unit.go, a new formatter will check for this message and if found, will format the markdown such that the logs are in a separate code block for better readability. Resolves #147360 Epic: None Release note: None
1 parent df1fb6a commit e73afad

File tree

7 files changed

+259
-1
lines changed

7 files changed

+259
-1
lines changed

pkg/cmd/bazci/githubpost/issues/condense.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ type RSGCrash struct {
4949
Schema string // the schema that the crash was induced with
5050
}
5151

52+
// FatalNodeRoachtest contains a fatal error from a node from a roachtest
53+
type FatalNodeRoachtest struct {
54+
Message,
55+
FatalLogs string
56+
}
57+
5258
// A CondensedMessage is a test log output garnished with useful helper methods
5359
// that extract concise information for seamless debugging.
5460
type CondensedMessage string
@@ -61,6 +67,8 @@ var fatalRE = regexp.MustCompile(`(?ms)(^F\d{6}.*?\n)(goroutine \d+.*?\n)\n`)
6167
var crasherRE = regexp.MustCompile(`(?s)( *rsg_test.go:\d{3}: Crash detected:.*?\n)(.*?;\n)`)
6268
var reproRE = regexp.MustCompile(`(?s)( *rsg_test.go:\d{3}: To reproduce, use schema:)`)
6369

70+
var roachtestNodeFatalRE = regexp.MustCompile(`(?ms)\A(.*?\n)((?:^F\d{6}\b[^\n]*(?:\n|$))+)`)
71+
6472
// FatalOrPanic constructs a FatalOrPanic. If no fatal or panic occurred in the
6573
// test, ok=false is returned.
6674
func (s CondensedMessage) FatalOrPanic(numPrecedingLines int) (fop FatalOrPanic, ok bool) {
@@ -98,6 +106,23 @@ func (s CondensedMessage) RSGCrash(lineLimit int) (c RSGCrash, ok bool) {
98106
return RSGCrash{}, false
99107
}
100108

109+
// FatalNodeRoachtest constructs a FatalNodeRoachtest which is used to
110+
// construct an issue with node fatal logs in a Roachtest. If not found, or if
111+
// regex matching doesn't return the exact expected number of matches,
112+
// ok=false is returned
113+
func (s CondensedMessage) FatalNodeRoachtest() (fnr FatalNodeRoachtest, ok bool) {
114+
ss := string(s)
115+
if matches := roachtestNodeFatalRE.FindStringSubmatchIndex(ss); matches != nil {
116+
if len(matches) != 6 {
117+
return FatalNodeRoachtest{}, false
118+
}
119+
fnr.Message = ss[matches[2] : matches[3]-1]
120+
fnr.FatalLogs = ss[matches[4]:matches[5]]
121+
return fnr, true
122+
}
123+
return FatalNodeRoachtest{}, false
124+
}
125+
101126
// String calls .Digest(30).
102127
func (s CondensedMessage) String() string {
103128
return s.Digest(30)

pkg/cmd/bazci/githubpost/issues/formatter_unit.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,11 @@ func (unitTestFormatterTyp) Body(r *Renderer, data TemplateData) error {
6969
r.Escaped("Schema:")
7070
r.CodeBlock("", rsgCrash.Schema)
7171
}
72+
} else if fnr, ok := data.CondensedMessage.FatalNodeRoachtest(); ok {
73+
r.Escaped("Failed with:")
74+
r.CodeBlock("", fnr.Message)
75+
r.Escaped("Fatal entries found in Cockroach logs:")
76+
r.CodeBlock("", fnr.FatalLogs)
7277
} else {
7378
r.CodeBlock("", data.CondensedMessage.Digest(50))
7479
}

pkg/cmd/roachtest/github_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,11 @@ func TestCreatePostRequest(t *testing.T) {
225225
case "lose-error-object":
226226
// Lose the error object which should make our flake detection fail.
227227
refError = errors.Newf("%s", redact.SafeString(refError.Error()))
228+
case "node-fatal":
229+
refError = errors.Newf(`(monitor.go:267).Wait: monitor failure: dial tcp 127.0.0.1:29000: connect: connection refused
230+
test artifacts and logs in: artifacts/roachtest/manual/monitor/test-failure/node-fatal-explicit-monitor/cpu_arch=arm64/run_1
231+
F250826 19:49:07.194443 3106 sql/sem/builtins/builtins.go:6063 ⋮ [T1,Vsystem,n1,client=127.0.0.1:54552,hostssl,user=‹roachprod›] 250 force_log_fatal(): ‹oops›
232+
`)
228233
}
229234
}
230235
}

pkg/cmd/roachtest/test_impl.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ type testImpl struct {
140140
// parameters if there is a failure. They will additionally be logged in the test itself
141141
// in case github issue posting is disabled.
142142
extraParams map[string]string
143+
144+
// githubMessage contains additional message information that will be
145+
// passed to github.MaybePost
146+
githubMessage string
143147
}
144148
// Map from version to path to the cockroach binary to be used when
145149
// mixed-version test wants a binary for that binary. If a particular version
@@ -558,6 +562,18 @@ func (t *testImpl) failureMsg() string {
558562
return b.String()
559563
}
560564

565+
func (t *testImpl) getGithubMessage() string {
566+
t.mu.RLock()
567+
defer t.mu.RUnlock()
568+
return t.mu.githubMessage
569+
}
570+
571+
func (t *testImpl) appendGithubMessage(msg string) {
572+
t.mu.Lock()
573+
defer t.mu.Unlock()
574+
t.mu.githubMessage += msg
575+
}
576+
561577
// failuresMatchingError checks whether the first error in trees of
562578
// any of the errors in the failures passed match the `refError`
563579
// target. If it does, `refError` is set to that target error value

pkg/cmd/roachtest/test_runner.go

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"net"
1818
"net/http"
1919
"os"
20+
"os/exec"
2021
"path/filepath"
2122
"runtime"
2223
"sort"
@@ -1268,8 +1269,12 @@ func (r *testRunner) runTest(
12681269

12691270
output := fmt.Sprintf("%s\ntest artifacts and logs in: %s", failureMsg, t.ArtifactsDir())
12701271
params := getTestParameters(t, issueInfo.cluster, issueInfo.vmCreateOpts)
1272+
githubMsg := output
1273+
if testGithubMsg := t.getGithubMessage(); testGithubMsg != "" {
1274+
githubMsg = fmt.Sprintf("%s\n%s", output, testGithubMsg)
1275+
}
12711276
logTestParameters(l, params)
1272-
issue, err := github.MaybePost(t, issueInfo, l, output, params)
1277+
issue, err := github.MaybePost(t, issueInfo, l, githubMsg, params)
12731278
if err != nil {
12741279
shout(ctx, l, stdout, "failed to post issue: %s", err)
12751280
atomic.AddInt32(&r.numGithubPostErrs, 1)
@@ -1490,10 +1495,18 @@ func (r *testRunner) runTest(
14901495
// From now on, all logging goes to test-teardown.log to give a clear separation between
14911496
// operations originating from the test vs the harness. The only error that can originate here
14921497
// is from artifact collection, which is best effort and for which we do not fail the test.
1498+
// TODO(wchoe): improve log destination consistency, above comment doesn't take deferred calls into account
1499+
// testRunner.runTest's deferred calls write to the original test.log, not test-teardown.log
1500+
// and the deferred calls aren't necessarily related to test teardown so the
1501+
// correct log to write to is ambiguous
14931502
replaceLogger("test-teardown")
14941503
if err := r.teardownTest(ctx, t, c, timedOut); err != nil {
14951504
l.PrintfCtx(ctx, "error during test teardown: %v; see test-teardown.log for details", err)
14961505
}
1506+
if err := r.inspectArtifacts(ctx, t, l); err != nil {
1507+
// inspect artifacts and potentially add helpful triage information for failed tests
1508+
l.PrintfCtx(ctx, "error during artifact inspection: %v", err)
1509+
}
14971510
}
14981511

14991512
// getVMNames returns a comma separated list of VM names.
@@ -1723,6 +1736,76 @@ func (r *testRunner) teardownTest(
17231736
return nil
17241737
}
17251738

1739+
// inspectArtifacts inspects node logs and attempts to write helpful triage
1740+
// information to the test log and testRunner.githubMessage
1741+
// This method is best effort and should not fail a test.
1742+
// This method writes to both testLogger which is expected to be test.log and
1743+
// t.L() which is test-teardown.log since inspectArtifacts is called after
1744+
// teardownTest
1745+
func (r *testRunner) inspectArtifacts(
1746+
ctx context.Context, t *testImpl, testLogger *logger.Logger,
1747+
) error {
1748+
1749+
if t.Failed() || roachtestflags.AlwaysCollectArtifacts {
1750+
t.L().Printf("Attempting to gather node fatal level logs for triage.")
1751+
out, err := gatherFatalNodeLogs(t, testLogger)
1752+
if err != nil {
1753+
return err
1754+
}
1755+
if out == "" {
1756+
t.L().Printf("No fatal level logs found.")
1757+
return nil
1758+
} else {
1759+
testLogger.PrintfCtx(ctx, "CockroachDB contains Fatal level logs. Up to the first 10 "+
1760+
"will be shown here. See node logs in artifacts for more details.\n%s", out)
1761+
t.appendGithubMessage(out)
1762+
return nil
1763+
}
1764+
}
1765+
return nil
1766+
}
1767+
1768+
// gatherFatalNodeLogs attempts to gather fatal level node logs to help with
1769+
// triage
1770+
func gatherFatalNodeLogs(t *testImpl, testLogger *logger.Logger) (string, error) {
1771+
logPattern := `^F[0-9]{6}`
1772+
filePattern := "logs/*unredacted/cockroach*.log"
1773+
// *unredacted captures patterns for single node and multi-node clusters
1774+
// e.g. unredacted, 1.unredacted
1775+
joinedFilePath := filepath.Join(t.ArtifactsDir(), filePattern)
1776+
targetFiles, err := filepath.Glob(joinedFilePath)
1777+
if err != nil {
1778+
return "", err
1779+
} else if len(targetFiles) == 0 {
1780+
return "", errors.Newf("No matching log files found for log pattern: %s and file pattern: %s",
1781+
logPattern, filePattern)
1782+
}
1783+
args := append([]string{"-E", "-m", "10", "-a", logPattern}, targetFiles...)
1784+
command := "grep"
1785+
t.L().Printf("Gathering fatal level logs with command: %q %s", command, strings.Join(args, " "))
1786+
// Works with local and remote node clusters because we will always download
1787+
// the artifacts if there's a test failure (except for timeout)
1788+
cmd := exec.Command("grep", args...)
1789+
out, err := cmd.CombinedOutput()
1790+
if err != nil {
1791+
var ee *exec.ExitError
1792+
if errors.As(err, &ee) && ee.ExitCode() == 1 {
1793+
testLogger.Printf("No fatal level logs found.")
1794+
// Not finding files isn't necessarily an error so don't return an error
1795+
return "", nil
1796+
}
1797+
return "", err
1798+
}
1799+
// trim file path from output for readability
1800+
lines := strings.Split(string(out), "\n")
1801+
for i, line := range lines {
1802+
if idx := strings.IndexByte(line, ':'); idx >= 0 {
1803+
lines[i] = strings.TrimLeft(line[idx+1:], " \t")
1804+
}
1805+
}
1806+
return strings.Join(lines, "\n"), err
1807+
}
1808+
17261809
// maybeSaveClusterDueToInvariantProblems detects rare conditions (such as
17271810
// storage durability crashes) on the cluster and if one is detected,
17281811
// unconditionally preserves the cluster for future debugging. It also creates
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Test failure due to node fatal should include fatal node logs in github issue
2+
3+
add-failure name=(oops) type=(node-fatal)
4+
----
5+
ok
6+
7+
post
8+
----
9+
----
10+
roachtest.github_test [failed]() on test_branch @ [test_SHA]():
11+
12+
Failed with:
13+
14+
```
15+
(monitor.go:267).Wait: monitor failure: dial tcp 127.0.0.1:29000: connect: connection refused
16+
test artifacts and logs in: artifacts/roachtest/manual/monitor/test-failure/node-fatal-explicit-monitor/cpu_arch=arm64/run_1
17+
```
18+
Fatal entries found in Cockroach logs:
19+
20+
```
21+
F250826 19:49:07.194443 3106 sql/sem/builtins/builtins.go:6063 ⋮ [T1,Vsystem,n1,client=127.0.0.1:54552,hostssl,user=?roachprod?] 250 force_log_fatal(): ?oops?
22+
```
23+
24+
Parameters:
25+
- <code>arch=amd64</code>
26+
- <code>cloud=gce</code>
27+
- <code>coverageBuild=false</code>
28+
- <code>cpu=4</code>
29+
- <code>encrypted=false</code>
30+
- <code>fs=ext4</code>
31+
- <code>localSSD=true</code>
32+
- <code>runtimeAssertionsBuild=false</code>
33+
- <code>ssd=0</code>
34+
<details><summary>Help</summary>
35+
<p>
36+
37+
38+
See: [roachtest README](https://github.com/cockroachdb/cockroach/blob/master/pkg/cmd/roachtest/README.md)
39+
40+
41+
42+
See: [How To Investigate \(internal\)](https://cockroachlabs.atlassian.net/l/c/SSSBr8c7)
43+
44+
45+
46+
See: [Grafana](https://go.crdb.dev/roachtest-grafana//github-test/1689957243000/1689957853000)
47+
48+
</p>
49+
</details>
50+
/cc @cockroachdb/unowned
51+
<sub>
52+
53+
[This test on roachdash](https://roachdash.crdb.dev/?filter=status:open%20t:.*github_test.*&sort=title+created&display=lastcommented+project) | [Improve this report!](https://github.com/cockroachdb/cockroach/tree/master/pkg/cmd/bazci/githubpost/issues)
54+
55+
</sub>
56+
57+
------
58+
Labels:
59+
- <code>O-roachtest</code>
60+
- <code>C-test-failure</code>
61+
- <code>release-blocker</code>
62+
Rendered:
63+
https://github.com/cockroachdb/cockroach/issues/new?body=roachtest.github_test+%5Bfailed%5D%28%29+on+test_branch+%40+%5Btest_SHA%5D%28%29%3A%0A%0AFailed+with%3A%0A%0A%60%60%60%0A%28monitor.go%3A267%29.Wait%3A+monitor+failure%3A+dial+tcp+127.0.0.1%3A29000%3A+connect%3A+connection+refused%0Atest+artifacts+and+logs+in%3A+artifacts%2Froachtest%2Fmanual%2Fmonitor%2Ftest-failure%2Fnode-fatal-explicit-monitor%2Fcpu_arch%3Darm64%2Frun_1%0A%60%60%60%0AFatal+entries+found+in+Cockroach+logs%3A%0A%0A%60%60%60%0AF250826+19%3A49%3A07.194443+3106+sql%2Fsem%2Fbuiltins%2Fbuiltins.go%3A6063+%E2%8B%AE+%5BT1%2CVsystem%2Cn1%2Cclient%3D127.0.0.1%3A54552%2Chostssl%2Cuser%3D%3Froachprod%3F%5D+250++force_log_fatal%28%29%3A+%3Foops%3F%0A%60%60%60%0A%0AParameters%3A%0A+-+%3Ccode%3Earch%3Damd64%3C%2Fcode%3E%0A+-+%3Ccode%3Ecloud%3Dgce%3C%2Fcode%3E%0A+-+%3Ccode%3EcoverageBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3Ecpu%3D4%3C%2Fcode%3E%0A+-+%3Ccode%3Eencrypted%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3Efs%3Dext4%3C%2Fcode%3E%0A+-+%3Ccode%3ElocalSSD%3Dtrue%3C%2Fcode%3E%0A+-+%3Ccode%3EruntimeAssertionsBuild%3Dfalse%3C%2Fcode%3E%0A+-+%3Ccode%3Essd%3D0%3C%2Fcode%3E%0A%3Cdetails%3E%3Csummary%3EHelp%3C%2Fsummary%3E%0A%3Cp%3E%0A%0A%0ASee%3A+%5Broachtest+README%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Fblob%2Fmaster%2Fpkg%2Fcmd%2Froachtest%2FREADME.md%29%0A%0A%0A%0ASee%3A+%5BHow+To+Investigate+%5C%28internal%5C%29%5D%28https%3A%2F%2Fcockroachlabs.atlassian.net%2Fl%2Fc%2FSSSBr8c7%29%0A%0A%0A%0ASee%3A+%5BGrafana%5D%28https%3A%2F%2Fgo.crdb.dev%2Froachtest-grafana%2F%2Fgithub-test%2F1689957243000%2F1689957853000%29%0A%0A%3C%2Fp%3E%0A%3C%2Fdetails%3E%0A%2Fcc+%40cockroachdb%2Funowned%0A%3Csub%3E%0A%0A%5BThis+test+on+roachdash%5D%28https%3A%2F%2Froachdash.crdb.dev%2F%3Ffilter%3Dstatus%3Aopen%2520t%3A.%2Agithub_test.%2A%26sort%3Dtitle%2Bcreated%26display%3Dlastcommented%2Bproject%29+%7C+%5BImprove+this+report%21%5D%28https%3A%2F%2Fgithub.com%2Fcockroachdb%2Fcockroach%2Ftree%2Fmaster%2Fpkg%2Fcmd%2Fbazci%2Fgithubpost%2Fissues%29%0A%0A%3C%2Fsub%3E%0A%0A------%0ALabels%3A%0A-+%3Ccode%3EO-roachtest%3C%2Fcode%3E%0A-+%3Ccode%3EC-test-failure%3C%2Fcode%3E%0A-+%3Ccode%3Erelease-blocker%3C%2Fcode%3E%0A&template=none&title=roachtest%3A+github_test+failed
64+
----
65+
----

pkg/cmd/roachtest/tests/roachtest.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
1616
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
1717
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
18+
"github.com/stretchr/testify/require"
1819
)
1920

2021
func registerRoachtest(r registry.Registry) {
@@ -53,4 +54,62 @@ func registerRoachtest(r registry.Registry) {
5354
Timeout: 3 * time.Minute,
5455
Cluster: r.MakeClusterSpec(3),
5556
})
57+
58+
// Manual test for verifying framework behavior in a test failure scenario
59+
// via unexpected node fatal error from an explicitly monitored node
60+
r.Add(registry.TestSpec{
61+
Name: "roachtest/manual/monitor/test-failure/node-fatal-explicit-monitor",
62+
Owner: registry.OwnerTestEng,
63+
Cluster: r.MakeClusterSpec(1),
64+
CompatibleClouds: registry.AllClouds,
65+
Suites: registry.ManualOnly,
66+
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
67+
monitorFatalTest(ctx, t, c)
68+
},
69+
})
70+
71+
// Manual test for verifying framework behavior in a test failure scenario
72+
// via unexpected node fatal error using the roachtest built-in node monitor
73+
r.Add(registry.TestSpec{
74+
Name: "roachtest/manual/monitor/test-failure/node-fatal-global-monitor",
75+
Owner: registry.OwnerTestEng,
76+
Cluster: r.MakeClusterSpec(1),
77+
CompatibleClouds: registry.AllClouds,
78+
Suites: registry.ManualOnly,
79+
Monitor: true,
80+
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
81+
monitorFatalTestGlobal(ctx, t, c)
82+
},
83+
})
84+
}
85+
86+
// monitorFatalTest will always fail with a node logging a fatal error in a
87+
// goroutine that is being watched by a monitor
88+
func monitorFatalTest(ctx context.Context, t test.Test, c cluster.Cluster) {
89+
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings())
90+
m := c.NewDeprecatedMonitor(ctx, c.Node(1))
91+
n1 := c.Conn(ctx, t.L(), 1)
92+
defer n1.Close()
93+
require.NoError(t, n1.PingContext(ctx))
94+
95+
m.Go(func(ctx context.Context) (err error) {
96+
_, err = n1.ExecContext(ctx, "SELECT crdb_internal.force_log_fatal('oops');")
97+
return err
98+
})
99+
m.Wait()
100+
}
101+
102+
// monitorFatalTestGlobal will always fail with a node logging a fatal error
103+
// not within an explicit goroutine. Expects registry.TestSpec.Monitor to be
104+
// set to True
105+
func monitorFatalTestGlobal(ctx context.Context, t test.Test, c cluster.Cluster) {
106+
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings())
107+
n1 := c.Conn(ctx, t.L(), 1)
108+
defer n1.Close()
109+
require.NoError(t, n1.PingContext(ctx))
110+
111+
_, err := n1.ExecContext(ctx, "SELECT crdb_internal.force_log_fatal('oops');")
112+
if err != nil {
113+
t.L().Printf("Error executing query: %s", err)
114+
}
56115
}

0 commit comments

Comments
 (0)