Skip to content

Commit 7302942

Browse files
craig[bot]srosenberg
andcommitted
Merge #148244
148244: roachtest: jepsen returns with exit code 255 when encountering unhandled exceptions r=darrylwong a=srosenberg Jepsen exits with `255` if an uhandled exception is raised. The error code is typically associated with SSH errors. Thus, the framework ends up misclassifying it as an SSH flake. This change wraps the jepsen runner script with a remapping of the exit code so that `255` is now mapped into `254`. In the process of testing the change, a couple of other issues were uncovered. `FetchLogs` now correctly skips downloading non-existent logs. Also, as of [1], Jepsen should be compiled and executed with JDK 1.17+, so we updated `tc-nightly-main` [2]. [1] cockroachdb/jepsen@d3e8f55 [2] cockroachdb/jepsen@50a97a1 Epic: none Fixes: #99681 Release note: None Co-authored-by: Stan Rosenberg <[email protected]>
2 parents c595470 + d69129e commit 7302942

File tree

5 files changed

+63
-28
lines changed

5 files changed

+63
-28
lines changed

pkg/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ ALL_TESTS = [
328328
"//pkg/roachprod/vm/ibm:ibm_test",
329329
"//pkg/roachprod/vm/local:local_test",
330330
"//pkg/roachprod/vm:vm_test",
331+
"//pkg/roachprod:roachprod_disallowed_imports_test",
331332
"//pkg/roachprod:roachprod_test",
332333
"//pkg/rpc/nodedialer:nodedialer_test",
333334
"//pkg/rpc:rpc_test",

pkg/cmd/roachtest/test/test_interface.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ import (
1616
// DefaultCockroachPath is the path where the binary passed to the
1717
// `--cockroach` flag will be made available in every node in the
1818
// cluster.
19+
// N.B. This constant is duplicated from roachtest to break build dependency.
20+
// Thus, any update should be reflected in both places.
1921
const DefaultCockroachPath = "./cockroach"
2022

2123
// DefaultDeprecatedWorkloadPath is the path where the binary passed

pkg/cmd/roachtest/tests/jepsen.go

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ a pull request for tc-nightly-main branch and after merging build a new
4848
artifact using:
4949
5050
# install build dependencies and build tools
51-
sudo apt-get -qqy install openjdk-8-jre openjdk-8-jre-headless libjna-java gnuplot
51+
sudo apt-get -qqy install openjdk-17-jre openjdk-17-jre-headless libjna-java gnuplot
5252
curl -o lein https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein
5353
chmod +x lein
5454
@@ -74,7 +74,7 @@ to running roachtest and update repository URLs in the file to your liking.
7474
const envBuildJepsen = "ROACHTEST_BUILD_JEPSEN"
7575

7676
const jepsenRepo = "https://github.com/cockroachdb/jepsen"
77-
const repoBranch = "tc-nightly"
77+
const repoBranch = "tc-nightly-main"
7878

7979
const gcpPath = "https://storage.googleapis.com/cockroach-jepsen"
8080
const binaryVersion = "0.1.0-6699eb4-standalone"
@@ -119,7 +119,7 @@ func initJepsen(ctx context.Context, t test.Test, c cluster.Cluster, j jepsenCon
119119
// so do it before the initialization check for ease of iteration.
120120
if err := c.GitClone(
121121
ctx, t.L(),
122-
"https://github.com/cockroachdb/jepsen", "/mnt/data1/jepsen", "tc-nightly", controller,
122+
jepsenRepo, "/mnt/data1/jepsen", repoBranch, controller,
123123
); err != nil {
124124
t.Fatal(err)
125125
}
@@ -163,10 +163,11 @@ func initJepsen(ctx context.Context, t test.Test, c cluster.Cluster, j jepsenCon
163163
// (which is not how our official builds are laid out).
164164
c.Run(ctx, option.WithNodes(c.All()), "tar --transform s,^,cockroach/, -c -z -f cockroach.tgz cockroach")
165165

166+
deps := `"sudo DEBIAN_FRONTEND=noninteractive apt-get -qqy install openjdk-17-jre openjdk-17-jre-headless libjna-java gnuplot > /dev/null 2>&1"`
167+
166168
// Install Jepsen's prereqs on the controller.
167169
if result, err := c.RunWithDetailsSingleNode(
168-
ctx, t.L(), option.WithNodes(controller), "sh", "-c",
169-
`"sudo DEBIAN_FRONTEND=noninteractive apt-get -qqy install openjdk-17-jre openjdk-17-jre-headless libjna-java gnuplot > /dev/null 2>&1"`,
170+
ctx, t.L(), option.WithNodes(controller), "sh", "-c", deps,
170171
); err != nil {
171172
if result.RemoteExitStatus == 100 {
172173
t.Skip("apt-get failure (#31944)", result.Stdout+result.Stderr)
@@ -211,7 +212,7 @@ type jepsenConfig struct {
211212
}
212213

213214
func makeJepsenConfig() jepsenConfig {
214-
if e := os.Getenv(envBuildJepsen); e != "" {
215+
if os.Getenv(envBuildJepsen) != "" {
215216
return jepsenConfig{
216217
buildFromSource: true,
217218
repoURL: jepsenRepo,
@@ -267,6 +268,8 @@ func (j jepsenConfig) startTest(
267268
ctx context.Context, t test.Test, run func(args ...string) error, testArgs string,
268269
) <-chan error {
269270
errCh := make(chan error, 1)
271+
var script string
272+
270273
if j.buildFromSource {
271274
// Install the jepsen package (into ~/.m2) before running tests in
272275
// the cockroach package. Clojure doesn't really understand
@@ -288,20 +291,28 @@ func (j jepsenConfig) startTest(
288291
}
289292
t.Fatalf("error installing Jepsen deps: %+v", err)
290293
}
291-
t.Go(func(context.Context, *logger.Logger) error {
292-
errCh <- run("bash", "-e", "-c", fmt.Sprintf(
293-
`"cd /mnt/data1/jepsen/cockroachdb && set -eo pipefail && ~/lein run %s > invoke.log 2>&1"`,
294-
testArgs))
295-
return nil
296-
})
294+
// N.B. jepsen exits with `255` if it encounters an unhandled exception.
295+
// (See https://github.com/cockroachdb/cockroach/issues/99681)
296+
// 255 is designated as SSH, hence we remap it to 254 below. (See errors.ClassifyCmdError)
297+
script = fmt.Sprintf(`"
298+
cd /mnt/data1/jepsen/cockroachdb &&
299+
set -eo pipefail &&
300+
rc=0; ~/lein run %s > invoke.log 2>&1 || rc=\$?;
301+
exit \$(( rc == 255 ? 254 : rc ))"`, testArgs)
297302
} else {
298-
t.Go(func(context.Context, *logger.Logger) error {
299-
errCh <- run("bash", "-e", "-c", fmt.Sprintf(
300-
`"cd /mnt/data1/jepsen/cockroachdb && set -eo pipefail && java -jar %s %s > invoke.log 2>&1"`,
301-
j.binaryName(), testArgs))
302-
return nil
303-
})
303+
// N.B. jepsen exits with `255` if it encounters an unhandled exception.
304+
// (See https://github.com/cockroachdb/cockroach/issues/99681)
305+
// 255 is designated as SSH, hence we remap it to 254 below. (See errors.ClassifyCmdError)
306+
script = fmt.Sprintf(`"
307+
cd /mnt/data1/jepsen/cockroachdb &&
308+
set -eo pipefail &&
309+
rc=0; java -jar %s %s > invoke.log 2>&1 || rc=\$?;
310+
exit \$(( rc == 255 ? 254 : rc ))"`, j.binaryName(), testArgs)
304311
}
312+
t.Go(func(context.Context, *logger.Logger) error {
313+
errCh <- run("bash", "-e", "-c", script)
314+
return nil
315+
})
305316
return errCh
306317
}
307318

pkg/roachprod/BUILD.bazel

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
2+
load("//pkg/testutils:buildutil/buildutil.bzl", "disallowed_imports_test")
23

34
go_library(
45
name = "roachprod",
@@ -14,7 +15,6 @@ go_library(
1415
"//pkg/build",
1516
"//pkg/cli/exit",
1617
"//pkg/cmd/roachprod/grafana",
17-
"//pkg/cmd/roachtest/test",
1818
"//pkg/roachprod/cloud",
1919
"//pkg/roachprod/config",
2020
"//pkg/roachprod/fluentbit",
@@ -58,3 +58,10 @@ go_test(
5858
"@com_github_stretchr_testify//require",
5959
],
6060
)
61+
62+
disallowed_imports_test(
63+
src = "roachprod",
64+
disallowed_prefixes = [
65+
"pkg/cmd/roachtest",
66+
],
67+
)

pkg/roachprod/roachprod.go

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ import (
3131
"github.com/cockroachdb/cockroach/pkg/build"
3232
"github.com/cockroachdb/cockroach/pkg/cli/exit"
3333
"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/grafana"
34-
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
3534
"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
3635
"github.com/cockroachdb/cockroach/pkg/roachprod/config"
3736
"github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit"
@@ -70,6 +69,13 @@ func (e *MalformedClusterNameError) Error() string {
7069
return fmt.Sprintf("Malformed cluster name %s, %s. Did you mean one of %s", e.name, e.reason, e.suggestions)
7170
}
7271

72+
// DefaultCockroachPath is the path where the binary passed to the
73+
// `--cockroach` flag will be made available in every node in the
74+
// cluster.
75+
// N.B. This constant is duplicated from roachtest to break build dependency.
76+
// Thus, any update should be reflected in both places.
77+
const DefaultCockroachPath = "./cockroach"
78+
7379
// findActiveAccounts is a hook for tests to inject their own FindActiveAccounts
7480
// implementation. i.e. unit tests that don't want to actually access a provider.
7581
var findActiveAccounts = vm.FindActiveAccounts
@@ -3026,40 +3032,48 @@ func FetchLogs(
30263032
if err != nil {
30273033
return err
30283034
}
3029-
3035+
// Found logs and corresponding nodes, which contain at least one log directory.
30303036
logDirs := make(map[string]struct{})
3037+
nodes := install.Nodes{}
3038+
30313039
for _, r := range results {
30323040
if r.Err != nil {
30333041
l.Printf("will not fetch logs for n%d due to error: %v", r.Node, r.Err)
3042+
continue
30343043
}
3035-
3044+
nodes = append(nodes, r.Node)
30363045
for _, logDir := range strings.Fields(r.Stdout) {
30373046
logDirs[logDir] = struct{}{}
30383047
}
30393048
}
3040-
3049+
if len(logDirs) == 0 {
3050+
l.Printf("no log directories found")
3051+
return nil
3052+
}
3053+
// For each found log directory, recursively scp its contents under the destination, prefixed by node id.
3054+
// E.g., n1's contents will show up under 1.unredacted, etc.
30413055
for logDir := range logDirs {
30423056
dirPath := filepath.Join(destination, logDir, "unredacted")
30433057
if err := os.MkdirAll(filepath.Dir(dirPath), 0755); err != nil {
30443058
return err
30453059
}
3046-
3047-
if err := c.Get(ctx, l, c.Nodes, logDir /* src */, dirPath /* dest */); err != nil {
3060+
// Runs a recursive scp in parallel.
3061+
if err := c.Get(ctx, l, nodes, logDir /* src */, dirPath /* dest */); err != nil {
30483062
l.Printf("failed to fetch log directory %s: %v", logDir, err)
30493063
if ctx.Err() != nil {
30503064
return errors.Wrap(err, "cluster.FetchLogs")
30513065
}
30523066
}
30533067
}
3054-
3055-
if err := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(c.Nodes), "", fmt.Sprintf("mkdir -p logs/redacted && %s debug merge-logs --redact logs/*.log > logs/redacted/combined.log", test.DefaultCockroachPath)); err != nil {
3068+
// Create a single redacted log.
3069+
if err := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(nodes), "", fmt.Sprintf("mkdir -p logs/redacted && %s debug merge-logs --redact logs/*.log > logs/redacted/combined.log", DefaultCockroachPath)); err != nil {
30563070
l.Printf("failed to redact logs: %v", err)
30573071
if ctx.Err() != nil {
30583072
return err
30593073
}
30603074
}
30613075
dest := filepath.Join(destination, "logs/cockroach.log")
3062-
return errors.Wrap(c.Get(ctx, l, c.Nodes, "logs/redacted/combined.log" /* src */, dest), "cluster.FetchLogs")
3076+
return errors.Wrap(c.Get(ctx, l, nodes, "logs/redacted/combined.log" /* src */, dest), "cluster.FetchLogs")
30633077
})
30643078
}
30653079

0 commit comments

Comments
 (0)