Skip to content

Commit fe5ec41

Browse files
craig[bot]herkolategan
andcommitted
Merge #153167
153167: roachprod-microbench: support long retries when staging r=golgeek,DarrylWong a=herkolategan Previously, sometimes while trying to stage the benchmark binaries the VMs might get preempted. But since this is a managed group and the VMs will come online again, we should retry for longer than the standard retry options. This change adds support for longer retries when staging benchmark binaries. It is enabled by default since this binary is mostly used in the weekly job. Epic: None Release note: None Co-authored-by: Herko Lategan <[email protected]>
2 parents 54f4373 + 2a5afcd commit fe5ec41

File tree

3 files changed

+31
-10
lines changed

3 files changed

+31
-10
lines changed

pkg/cmd/roachprod-microbench/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,11 +121,12 @@ func makeRunCommand() *cobra.Command {
121121
}
122122

123123
func makeStageCommand() *cobra.Command {
124+
longRetries := true
124125
runCmdFunc := func(cmd *cobra.Command, args []string) error {
125126
cluster := args[0]
126127
src := args[1]
127128
dest := args[2]
128-
return stage(cluster, src, dest)
129+
return stage(cluster, src, dest, longRetries)
129130
}
130131

131132
cmd := &cobra.Command{
@@ -139,6 +140,7 @@ The source can be a local path or a GCS URI.`,
139140
RunE: runCmdFunc,
140141
}
141142
cmd.Flags().BoolVar(&roachprodConfig.Quiet, "quiet", roachprodConfig.Quiet, "suppress roachprod progress output")
143+
cmd.Flags().BoolVar(&longRetries, "long-retries", longRetries, "use longer retry intervals to handle transient errors (for use with recoverable managed instances)")
142144
return cmd
143145
}
144146

pkg/cmd/roachprod-microbench/roachprod_util.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@ func InitRoachprod() {
2323

2424
// RoachprodRun runs a command on a roachprod cluster with the given cluster name and logger.
2525
// It takes a list of command arguments and passes them to the roachprod command execution.
26-
func RoachprodRun(clusterName string, l *logger.Logger, cmdArray []string) error {
26+
func RoachprodRun(
27+
clusterName string, l *logger.Logger, cmdArray []string, runOptions install.RunOptions,
28+
) error {
2729
// Execute the roachprod command with the provided context, logger, cluster name, and options.
2830
return roachprod.Run(
2931
context.Background(), l, clusterName, "", "", install.SimpleSecureOption(false),
30-
os.Stdout, os.Stderr, cmdArray, install.DefaultRunOptions(),
32+
os.Stdout, os.Stderr, cmdArray, runOptions,
3133
)
3234
}
3335

pkg/cmd/roachprod-microbench/stage.go

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,18 @@ import (
1010
"os"
1111
"path"
1212
"strings"
13+
"time"
1314

1415
"github.com/cockroachdb/cockroach/pkg/roachprod"
16+
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
1517
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
18+
"github.com/cockroachdb/cockroach/pkg/util/retry"
1619
"github.com/cockroachdb/errors"
1720
)
1821

1922
// stage copies the specified archive to the remote machine and extracts it to
2023
// the specified directory (creating it if it does not exist).
21-
func stage(cluster, archivePath, remoteDest string) (err error) {
24+
func stage(cluster, archivePath, remoteDest string, longRetries bool) (err error) {
2225
ctx := context.Background()
2326

2427
InitRoachprod()
@@ -28,27 +31,41 @@ func stage(cluster, archivePath, remoteDest string) (err error) {
2831
return
2932
}
3033

34+
runOptions := install.DefaultRunOptions()
35+
if longRetries {
36+
// For VMs that have started failing with transient errors (usually due to
37+
// preemption), we want to introduce a longer retry period. These VMs may
38+
// still be recoverable since they're part of a managed instance group
39+
// that attempts to recover failed VMs.
40+
runOptions = runOptions.WithRetryOpts(retry.Options{
41+
InitialBackoff: 1 * time.Minute,
42+
MaxBackoff: 5 * time.Minute,
43+
Multiplier: 2,
44+
MaxRetries: 10,
45+
})
46+
}
47+
3148
archiveName := path.Base(archivePath)
3249
archiveRemotePath := path.Join("/tmp", archiveName)
3350

3451
defer func() {
3552
// Remove the remote archive after we're done.
36-
cleanUpErr := RoachprodRun(cluster, l, []string{"rm", "-rf", archiveRemotePath})
53+
cleanUpErr := RoachprodRun(cluster, l, []string{"rm", "-rf", archiveRemotePath}, runOptions)
3754
err = errors.CombineErrors(err, errors.Wrapf(cleanUpErr, "removing remote archive: %s", archiveRemotePath))
3855
}()
3956

4057
// Remove the remote archive and destination directory if they exist.
41-
if err = RoachprodRun(cluster, l, []string{"rm", "-rf", archiveRemotePath}); err != nil {
58+
if err = RoachprodRun(cluster, l, []string{"rm", "-rf", archiveRemotePath}, runOptions); err != nil {
4259
return errors.Wrapf(err, "removing remote archive: %s", archiveRemotePath)
4360
}
44-
if err = RoachprodRun(cluster, l, []string{"rm", "-rf", remoteDest}); err != nil {
61+
if err = RoachprodRun(cluster, l, []string{"rm", "-rf", remoteDest}, runOptions); err != nil {
4562
return errors.Wrapf(err, "removing remote destination: %s", remoteDest)
4663
}
4764

4865
// Copy the archive to the remote machine.
4966
copyFromGCS := strings.HasPrefix(archivePath, "gs://")
5067
if copyFromGCS {
51-
if err = RoachprodRun(cluster, l, []string{"gsutil", "-q", "-m", "cp", archivePath, archiveRemotePath}); err != nil {
68+
if err = RoachprodRun(cluster, l, []string{"gsutil", "-q", "-m", "cp", archivePath, archiveRemotePath}, runOptions); err != nil {
5269
return errors.Wrapf(err, "copying archive from GCS: %s", archivePath)
5370
}
5471
} else {
@@ -58,10 +75,10 @@ func stage(cluster, archivePath, remoteDest string) (err error) {
5875
}
5976

6077
// Extract the archive on the remote machine.
61-
if err = RoachprodRun(cluster, l, []string{"mkdir", "-p", remoteDest}); err != nil {
78+
if err = RoachprodRun(cluster, l, []string{"mkdir", "-p", remoteDest}, runOptions); err != nil {
6279
return errors.Wrapf(err, "creating remote destination: %s", remoteDest)
6380
}
64-
if err = RoachprodRun(cluster, l, []string{"tar", "-C", remoteDest, "-xzf", archiveRemotePath}); err != nil {
81+
if err = RoachprodRun(cluster, l, []string{"tar", "-C", remoteDest, "-xzf", archiveRemotePath}, runOptions); err != nil {
6582
return errors.Wrapf(err, "extracting archive: %s", archiveRemotePath)
6683
}
6784

0 commit comments

Comments
 (0)