Skip to content

Commit f0a3de3

Browse files
roachprod: add retries to Install
This change adds retries to `roachprod.Install` to mitigate test failures caused by package installation failures. The webhook test in `cdc.go` is updated to use `Cluster.Install` to install `go`. The install command for `go` is now just `sudo apt --yes install golang-go;`. No previously existing uses of `Cluster.Install` relied on the old `go` install command, so changing it should be safe.

Informs: cockroachdb#103316
Informs: cockroachdb#107088
Closes: cockroachdb#71934

Release note: None
Epic: None
1 parent 48f01bf commit f0a3de3

File tree

4 files changed

+38
-28
lines changed

4 files changed

+38
-28
lines changed

pkg/cmd/roachtest/tests/cdc.go

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,16 @@ type cdcTester struct {
9191
doneCh chan struct{}
9292
}
9393

94+
// The node on which the webhook sink will be installed and run.
95+
func (ct *cdcTester) webhookSinkNode() option.NodeListOption {
96+
return ct.cluster.Node(ct.cluster.Spec().NodeCount)
97+
}
98+
99+
// The node on which the kafka sink will be installed and run.
100+
func (ct *cdcTester) kafkaSinkNode() option.NodeListOption {
101+
return ct.cluster.Node(ct.cluster.Spec().NodeCount)
102+
}
103+
94104
// startStatsCollection sets the start point of the stats collection window
95105
// and returns a function which should be called at the end of the test to dump a
96106
// stats.json file to the artifacts directory.
@@ -158,7 +168,7 @@ func (ct *cdcTester) setupSink(args feedArgs) string {
158168
sinkURI = `experimental-gs://cockroach-tmp/roachtest/` + ts + "?AUTH=implicit"
159169
case webhookSink:
160170
ct.t.Status("webhook install")
161-
webhookNode := ct.cluster.Node(ct.cluster.Spec().NodeCount)
171+
webhookNode := ct.webhookSinkNode()
162172
rootFolder := `/home/ubuntu`
163173
nodeIPs, _ := ct.cluster.ExternalIP(ct.ctx, ct.logger, webhookNode)
164174

@@ -184,21 +194,6 @@ func (ct *cdcTester) setupSink(args feedArgs) string {
184194
ct.t.Fatal(err)
185195
}
186196

187-
// As seen in #107061, this can hit a 503 Service Unavailable when
188-
// trying to download the package, so we retry every 30 seconds
189-
// for up to 5 mins below.
190-
err = retry.WithMaxAttempts(ct.ctx, retry.Options{
191-
InitialBackoff: 30 * time.Second,
192-
Multiplier: 1,
193-
}, 10, func() error {
194-
err = ct.cluster.RunE(ct.ctx, webhookNode, `sudo apt --yes install golang-go;`)
195-
err = errors.Wrap(err, "infrastructure failure; could not install golang")
196-
return err
197-
})
198-
if err != nil {
199-
ct.t.Skip(err)
200-
}
201-
202197
// Start the server in its own monitor to not block ct.mon.Wait()
203198
serverExecCmd := fmt.Sprintf(`go run webhook-server-%d.go`, webhookPort)
204199
m := ct.cluster.NewMonitor(ct.ctx, ct.workloadNode)
@@ -219,7 +214,7 @@ func (ct *cdcTester) setupSink(args feedArgs) string {
219214
case pubsubSink:
220215
sinkURI = changefeedccl.GcpScheme + `://cockroach-ephemeral` + "?AUTH=implicit&topic_name=pubsubSink-roachtest&region=us-east1"
221216
case kafkaSink:
222-
kafkaNode := ct.cluster.Node(ct.cluster.Spec().NodeCount)
217+
kafkaNode := ct.kafkaSinkNode()
223218
kafka := kafkaManager{
224219
t: ct.t,
225220
c: ct.cluster,
@@ -1303,6 +1298,13 @@ func registerCDC(r registry.Registry) {
13031298
ct := newCDCTester(ctx, t, c)
13041299
defer ct.Close()
13051300

1301+
// Consider an installation failure to be a flake which is out of
1302+
// our control. This should be rare.
1303+
err := c.Install(ctx, t.L(), ct.webhookSinkNode(), "go")
1304+
if err != nil {
1305+
t.Skip(err)
1306+
}
1307+
13061308
ct.runTPCCWorkload(tpccArgs{warehouses: 100, duration: "30m"})
13071309

13081310
// The deprecated webhook sink is unable to handle the throughput required for 100 warehouses
@@ -1351,7 +1353,7 @@ func registerCDC(r registry.Registry) {
13511353

13521354
ct.runTPCCWorkload(tpccArgs{warehouses: 1})
13531355

1354-
kafkaNode := ct.cluster.Node(ct.cluster.Spec().NodeCount)
1356+
kafkaNode := ct.kafkaSinkNode()
13551357
kafka := kafkaManager{
13561358
t: ct.t,
13571359
c: ct.cluster,

pkg/roachprod/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ go_library(
2525
"//pkg/server/debug/replay",
2626
"//pkg/util/ctxgroup",
2727
"//pkg/util/httputil",
28+
"//pkg/util/retry",
2829
"//pkg/util/syncutil",
2930
"//pkg/util/timeutil",
3031
"@com_github_cockroachdb_errors//:errors",

pkg/roachprod/install/install.go

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,7 @@ sudo apt-get update;
102102
sudo apt-get install -y gcc;
103103
`,
104104

105-
// graphviz and rlwrap are useful for pprof
106-
"go": `
107-
sudo apt-get update;
108-
sudo apt-get install -y graphviz rlwrap;
109-
110-
curl https://dl.google.com/go/go1.12.linux-amd64.tar.gz | sudo tar -C /usr/local -xz;
111-
echo 'export PATH=$PATH:/usr/local/go/bin' | sudo tee /etc/profile.d/go.sh > /dev/null;
112-
sudo chmod +x /etc/profile.d/go.sh;
113-
`,
105+
"go": `sudo apt --yes install golang-go;`,
114106

115107
"haproxy": `
116108
sudo apt-get update;

pkg/roachprod/roachprod.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/cockroachdb/cockroach/pkg/server/debug/replay"
4646
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
4747
"github.com/cockroachdb/cockroach/pkg/util/httputil"
48+
"github.com/cockroachdb/cockroach/pkg/util/retry"
4849
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
4950
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
5051
"github.com/cockroachdb/errors"
@@ -833,7 +834,21 @@ func Install(ctx context.Context, l *logger.Logger, clusterName string, software
833834
if err != nil {
834835
return err
835836
}
836-
return install.Install(ctx, l, c, software)
837+
838+
// As seen in #103316, this can hit a 503 Service Unavailable when
839+
// trying to download the package, so we retry every 30 seconds
840+
// for up to 5 mins below. The caller may choose to fail or skip the test.
841+
return retry.WithMaxAttempts(ctx, retry.Options{
842+
InitialBackoff: 30 * time.Second,
843+
Multiplier: 1,
844+
}, 10, func() error {
845+
err := install.Install(ctx, l, c, software)
846+
err = errors.Wrapf(err, "retryable infrastructure error: could not install %s", software)
847+
if err != nil {
848+
l.Printf(err.Error())
849+
}
850+
return err
851+
})
837852
}
838853

839854
// Download downloads 3rd party tools, using a GCS cache if possible.

0 commit comments

Comments
 (0)