Skip to content

Commit f016035

Browse files
committed
roachtest: unconditionally save clusters that show raft fatal errors
When a cluster's logs contain a raft panic, it will be extended (by a week), volume snapshots will be taken, and the cluster will not be destroyed. This gives us the artifacts for a thorough investigation.

Verified manually via:

```
run --local acceptance/invariant-check-detection/failed=true
```

Here is the (editorialized) output:

```
test-teardown: 2025/05/20 08:15:15 cluster.go:2559: running cmd `([ -d logs ] && grep -RE '^...` on nodes [:1-4]; details in run_081515.744363000_n1-4_d-logs-grep-RE-Fraft.log
test-teardown: 2025/05/20 08:15:16 cluster.go:2995: extending cluster by 168h0m0s
test-teardown: 2025/05/20 08:15:16 cluster.go:1104: saving cluster local [tag:] (4 nodes) for debugging (--debug specified)
test-teardown: 2025/05/20 08:15:16 test_impl.go:478: test failure #2: full stack retained in failure_2.log: (test_runner.go:1705).maybeSaveClusterDueToInvariantProblems: invariant problem - snap name invariant-problem-local-8897676895823393049:
logs/foo.log:F250502 11:37:20.387424 1036 raft/raft.go:2411 ⋮ [T1,Vsystem,n1,s1,r155/1:‹/Table/113/1/{43/578…-51/201…}›] 80 match(30115) is out of range [lastIndex(30114)]. Was the raft log corrupted, truncated, or lost?
```

Closes #145953.
Informs #146617.
Informs #138028.

Epic: none
1 parent dbf5d47 commit f016035

File tree

2 files changed

+85
-1
lines changed

2 files changed

+85
-1
lines changed

pkg/cmd/roachtest/test_runner.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1615,6 +1615,11 @@ func (r *testRunner) postTestAssertions(
16151615
func (r *testRunner) teardownTest(
16161616
ctx context.Context, t *testImpl, c *clusterImpl, timedOut bool,
16171617
) error {
1618+
// Check for rare conditions (such as storage durability crashes) at this
1619+
// point. This may still mark the test as failed (so that we enter artifacts
1620+
// collection below).
1621+
r.maybeSaveClusterDueToInvariantProblems(ctx, t, c)
1622+
16181623
if timedOut || t.Failed() || roachtestflags.AlwaysCollectArtifacts {
16191624
err := r.collectArtifacts(ctx, t, c, timedOut, time.Hour)
16201625
if err != nil {
@@ -1669,6 +1674,43 @@ func (r *testRunner) teardownTest(
16691674
return nil
16701675
}
16711676

1677+
// maybeSaveClusterDueToInvariantProblems detects rare conditions (such as
1678+
// storage durability crashes) on the cluster and if one is detected,
1679+
// unconditionally preserves the cluster for future debugging. It also creates
1680+
// volume snapshots so that the durable state close to the incident is
1681+
// preserved.
1682+
func (r *testRunner) maybeSaveClusterDueToInvariantProblems(
1683+
ctx context.Context, t *testImpl, c *clusterImpl,
1684+
) {
1685+
if len(c.Nodes()) == 0 {
1686+
return // test only
1687+
}
1688+
dets, err := c.RunWithDetails(ctx, t.L(), option.WithNodes(c.All()),
1689+
"([ -d logs ] && grep -RE '^F.*raft' logs) || true",
1690+
)
1691+
for _, det := range dets {
1692+
err = errors.CombineErrors(err, det.Err)
1693+
}
1694+
if err != nil {
1695+
t.L().Printf(
1696+
"failed to check whether to save cluster due to invariant problems: %s",
1697+
err,
1698+
)
1699+
return
1700+
}
1701+
1702+
for _, det := range dets {
1703+
if det.Stdout != "" {
1704+
_ = c.Extend(ctx, 7*24*time.Hour, t.L())
1705+
snapName := "invariant-problem-" + c.Name() + "-" + strconv.Itoa(rand.Int())
1706+
_, _ = c.CreateSnapshot(ctx, snapName)
1707+
c.Save(ctx, "invariant problem - snap name "+snapName, t.L())
1708+
t.Error("invariant problem - snap name " + snapName + ":\n" + det.Stdout)
1709+
return
1710+
}
1711+
}
1712+
}
1713+
16721714
func (r *testRunner) collectArtifacts(
16731715
ctx context.Context, t *testImpl, c *clusterImpl, timedOut bool, timeout time.Duration,
16741716
) error {

pkg/cmd/roachtest/tests/acceptance.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@ import (
1111
"time"
1212

1313
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
14+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
1415
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
1516
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
1617
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
18+
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
1719
"github.com/cockroachdb/errors"
20+
"github.com/stretchr/testify/require"
1821
)
1922

2023
func registerAcceptance(r registry.Registry) {
@@ -81,6 +84,22 @@ func registerAcceptance(r registry.Registry) {
8184
// and version upgrade is impossible to test as of 05/2025.
8285
incompatibleClouds: registry.OnlyIBM,
8386
},
87+
// Tests for maybeSaveClusterDueToInvariantProblems. These don't verify
88+
// that everything works as it should, but they can be run to verify
89+
// manually that the cluster is saved correctly and the log output is
90+
// helpful.
91+
{
92+
name: "invariant-check-detection/failed=true",
93+
fn: runInvariantCheckDetectionFailed,
94+
timeout: time.Hour,
95+
skip: "manual only",
96+
},
97+
{
98+
name: "invariant-check-detection/failed=false",
99+
fn: runInvariantCheckDetectionSuccess,
100+
timeout: time.Hour,
101+
skip: "manual only",
102+
},
84103
},
85104
registry.OwnerDisasterRecovery: {
86105
{
@@ -145,7 +164,9 @@ func registerAcceptance(r registry.Registry) {
145164
tc.name,
146165
))
147166
}
148-
suites := append([]string{registry.Nightly, registry.Quick, registry.Acceptance}, tc.suites...)
167+
var suites []string
168+
suites = append(suites, registry.Nightly, registry.Quick, registry.Acceptance)
169+
suites = append(suites, tc.suites...)
149170
testSpec := registry.TestSpec{
150171
Name: "acceptance/" + tc.name,
151172
Owner: owner,
@@ -171,3 +192,24 @@ func registerAcceptance(r registry.Registry) {
171192
}
172193
}
173194
}
195+
196+
func runInvariantCheckDetectionFailed(ctx context.Context, t test.Test, c cluster.Cluster) {
197+
runInvariantCheckDetection(ctx, t, c, true)
198+
}
199+
200+
func runInvariantCheckDetectionSuccess(ctx context.Context, t test.Test, c cluster.Cluster) {
201+
runInvariantCheckDetection(ctx, t, c, false)
202+
}
203+
204+
func runInvariantCheckDetection(ctx context.Context, t test.Test, c cluster.Cluster, failed bool) {
205+
c.Put(ctx, t.Cockroach(), "cockroach")
206+
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.Range(1, 3))
207+
require.NoError(t, c.PutString(ctx, `
208+
foo br baz
209+
F250502 11:37:20.387424 1036 raft/raft.go:2411 ⋮ [T1,Vsystem,n1,s1,r155/1:‹/Table/113/1/{43/578…-51/201…}›] 80 match(30115) is out of range [lastIndex(30114)]. Was the raft log corrupted, truncated, or lost?
210+
asdasds
211+
`, "logs/foo.log", 0644, c.Node(2)))
212+
if failed {
213+
t.Error("boom")
214+
}
215+
}

0 commit comments

Comments
 (0)