Skip to content

Commit 0a44fb9

Browse files
craig[bot]srosenbergherkolategan
committed
148861: roachtest: add `s390x-test-failure` GH label when arch == "s390x" r=darrylwong,rickystewart a=srosenberg To simplify tracking roachtest failures on s390x, we add `s390x-test-failure` Github label. Epic: none Release note: None 148892: roachtest: fix test monitor error r=srosenberg a=herkolategan Previously, the global test monitor would fatal on an error. This is similar to how the original monitor worked. But the original monitor would fatal where a panic could be recovered by the goroutine running the test. This would immediately short circuit the test. However, the global monitor can not use fatal since it's running in a separate goroutine which causes the panic to be unrecoverable. This change makes the global monitor call `Errorf` instead of `Fatalf`. The test context will now instead be cancelled. Epic: None Release note: None Co-authored-by: Stan Rosenberg <[email protected]> Co-authored-by: Herko Lategan <[email protected]>
3 parents f904a99 + 2809c93 + 603c888 commit 0a44fb9

File tree

5 files changed

+97
-8
lines changed

5 files changed

+97
-8
lines changed

pkg/cmd/roachtest/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ go_test(
9898
"main_test.go",
9999
"test_filter_test.go",
100100
"test_impl_test.go",
101+
"test_monitor_test.go",
101102
"test_registry_test.go",
102103
"test_test.go",
103104
"zip_util_test.go",

pkg/cmd/roachtest/github.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ func (g *githubIssues) createPostRequest(
217217
const infraFlakeLabel = "X-infra-flake"
218218
const runtimeAssertionsLabel = "B-runtime-assertions-enabled"
219219
const coverageLabel = "B-coverage-enabled"
220+
const s390xTestFailureLabel = "s390x-test-failure"
220221
labels := []string{"O-roachtest"}
221222
if infraFlake {
222223
labels = append(labels, infraFlakeLabel)
@@ -235,6 +236,11 @@ func (g *githubIssues) createPostRequest(
235236
labels = append(labels, coverageLabel)
236237
}
237238
}
239+
// N.B. To simplify tracking failures on s390x, we add the designated s390x-test-failure label. This could be removed
240+
// in the future, i.e., after several major releases, when we expect s390x to be sufficiently stable.
241+
if arch := params["arch"]; vm.CPUArch(arch) == vm.ArchS390x {
242+
labels = append(labels, s390xTestFailureLabel)
243+
}
238244
labels = append(labels, spec.ExtraLabels...)
239245

240246
teams, err := g.teamLoader()

pkg/cmd/roachtest/test_monitor.go

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,28 @@ import (
1111
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
1212
)
1313

14-
var _ test.Monitor = &testMonitorImpl{}
15-
1614
type testMonitorImpl struct {
17-
*monitorImpl
15+
monitor interface {
16+
test.Monitor
17+
WaitForNodeDeath() error
18+
}
19+
t test.Test
1820
}
1921

20-
func newTestMonitor(ctx context.Context, t test.Test, c *clusterImpl) *testMonitorImpl {
22+
// newTestMonitor is a function that creates a new test monitor. It can be used for testing
23+
// purposes to replace the default monitor with a custom implementation.
24+
var newTestMonitor = func(ctx context.Context, t test.Test, c *clusterImpl) *testMonitorImpl {
2125
return &testMonitorImpl{
22-
monitorImpl: newMonitor(ctx, t, c, true /* expectExactProcessDeath */),
26+
monitor: newMonitor(ctx, t, c, true /* expectExactProcessDeath */),
27+
t: t,
2328
}
2429
}
2530

2631
func (m *testMonitorImpl) start() {
2732
go func() {
28-
err := m.WaitForNodeDeath()
33+
err := m.monitor.WaitForNodeDeath()
2934
if err != nil {
30-
m.t.Fatal(err)
35+
m.t.Errorf("test monitor: %v", err)
3136
}
3237
}()
3338
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package main
7+
8+
import (
9+
"context"
10+
"testing"
11+
"time"
12+
13+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
14+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
15+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
16+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
17+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
18+
"github.com/cockroachdb/cockroach/pkg/util/stop"
19+
"github.com/cockroachdb/errors"
20+
"github.com/stretchr/testify/require"
21+
)
22+
23+
type stubTestMonitorError struct{}
24+
25+
func (s *stubTestMonitorError) ExpectProcessDead(
26+
nodes option.NodeListOption, opts ...option.OptionFunc,
27+
) {
28+
}
29+
30+
func (s *stubTestMonitorError) ExpectProcessAlive(
31+
nodes option.NodeListOption, opts ...option.OptionFunc,
32+
) {
33+
}
34+
35+
func (s *stubTestMonitorError) WaitForNodeDeath() error {
36+
return errors.New("test error")
37+
}
38+
39+
func TestGlobalMonitorError(t *testing.T) {
40+
newTestMonitor = func(_ context.Context, t test.Test, c *clusterImpl) *testMonitorImpl {
41+
return &testMonitorImpl{
42+
monitor: &stubTestMonitorError{},
43+
t: t,
44+
}
45+
}
46+
47+
ctx := context.Background()
48+
stopper := stop.NewStopper()
49+
defer stopper.Stop(ctx)
50+
cr := newClusterRegistry()
51+
runner := newUnitTestRunner(cr, stopper)
52+
53+
var buf syncedBuffer
54+
copt := defaultClusterOpt()
55+
lopt := defaultLoggingOpt(&buf)
56+
57+
mockTest := registry.TestSpec{
58+
Name: `mock test`,
59+
Owner: OwnerUnitTest,
60+
Cluster: spec.MakeClusterSpec(0),
61+
CompatibleClouds: registry.AllExceptAWS,
62+
Suites: registry.Suites(registry.Nightly),
63+
CockroachBinary: registry.StandardCockroach,
64+
Monitor: true,
65+
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
66+
select {
67+
case <-ctx.Done():
68+
return
69+
case <-time.After(5 * time.Minute):
70+
return
71+
}
72+
},
73+
}
74+
err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
75+
defaultParallelism, copt, testOpts{}, lopt)
76+
require.Error(t, err)
77+
}

pkg/cmd/roachtest/test_runner.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1351,7 +1351,7 @@ func (r *testRunner) runTest(
13511351

13521352
t.taskManager = task.NewManager(runCtx, t.L())
13531353
testMonitor := newTestMonitor(runCtx, t, c)
1354-
t.monitor = testMonitor
1354+
t.monitor = testMonitor.monitor
13551355

13561356
t.mu.Lock()
13571357
// t.Fatal() will cancel this context.

0 commit comments

Comments
 (0)