@@ -10,15 +10,18 @@ import (
10
10
"os"
11
11
"path"
12
12
"strings"
13
+ "time"
13
14
14
15
"github.com/cockroachdb/cockroach/pkg/roachprod"
16
+ "github.com/cockroachdb/cockroach/pkg/roachprod/install"
15
17
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
18
+ "github.com/cockroachdb/cockroach/pkg/util/retry"
16
19
"github.com/cockroachdb/errors"
17
20
)
18
21
19
22
// stage copies the specified archive to the remote machine and extracts it to
20
23
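// the specified directory (creating it if it does not exist). If longRetries is
// true, the commands issued through RoachprodRun use a longer retry backoff so
// that transient VM failures (e.g. preemption) have time to recover.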
21
- func stage (cluster , archivePath , remoteDest string ) (err error ) {
24
+ func stage (cluster , archivePath , remoteDest string , longRetries bool ) (err error ) {
22
25
ctx := context .Background ()
23
26
24
27
InitRoachprod ()
@@ -28,27 +31,41 @@ func stage(cluster, archivePath, remoteDest string) (err error) {
28
31
return
29
32
}
30
33
34
+ runOptions := install .DefaultRunOptions ()
35
+ if longRetries {
36
+ // For VMs that have started failing with transient errors (usually due to
37
+ // preemption), we want to introduce a longer retry period. These VMs may
38
+ // still be recoverable since they're part of a managed instance group
39
+ // that attempts to recover failed VMs.
40
+ runOptions = runOptions .WithRetryOpts (retry.Options {
41
+ InitialBackoff : 1 * time .Minute ,
42
+ MaxBackoff : 5 * time .Minute ,
43
+ Multiplier : 2 ,
44
+ MaxRetries : 10 ,
45
+ })
46
+ }
47
+
31
48
archiveName := path .Base (archivePath )
32
49
archiveRemotePath := path .Join ("/tmp" , archiveName )
33
50
34
51
defer func () {
35
52
// Remove the remote archive after we're done.
36
- cleanUpErr := RoachprodRun (cluster , l , []string {"rm" , "-rf" , archiveRemotePath })
53
+ cleanUpErr := RoachprodRun (cluster , l , []string {"rm" , "-rf" , archiveRemotePath }, runOptions )
37
54
err = errors .CombineErrors (err , errors .Wrapf (cleanUpErr , "removing remote archive: %s" , archiveRemotePath ))
38
55
}()
39
56
40
57
// Remove the remote archive and destination directory if they exist.
41
- if err = RoachprodRun (cluster , l , []string {"rm" , "-rf" , archiveRemotePath }); err != nil {
58
+ if err = RoachprodRun (cluster , l , []string {"rm" , "-rf" , archiveRemotePath }, runOptions ); err != nil {
42
59
return errors .Wrapf (err , "removing remote archive: %s" , archiveRemotePath )
43
60
}
44
- if err = RoachprodRun (cluster , l , []string {"rm" , "-rf" , remoteDest }); err != nil {
61
+ if err = RoachprodRun (cluster , l , []string {"rm" , "-rf" , remoteDest }, runOptions ); err != nil {
45
62
return errors .Wrapf (err , "removing remote destination: %s" , remoteDest )
46
63
}
47
64
48
65
// Copy the archive to the remote machine.
49
66
copyFromGCS := strings .HasPrefix (archivePath , "gs://" )
50
67
if copyFromGCS {
51
- if err = RoachprodRun (cluster , l , []string {"gsutil" , "-q" , "-m" , "cp" , archivePath , archiveRemotePath }); err != nil {
68
+ if err = RoachprodRun (cluster , l , []string {"gsutil" , "-q" , "-m" , "cp" , archivePath , archiveRemotePath }, runOptions ); err != nil {
52
69
return errors .Wrapf (err , "copying archive from GCS: %s" , archivePath )
53
70
}
54
71
} else {
@@ -58,10 +75,10 @@ func stage(cluster, archivePath, remoteDest string) (err error) {
58
75
}
59
76
60
77
// Extract the archive on the remote machine.
61
- if err = RoachprodRun (cluster , l , []string {"mkdir" , "-p" , remoteDest }); err != nil {
78
+ if err = RoachprodRun (cluster , l , []string {"mkdir" , "-p" , remoteDest }, runOptions ); err != nil {
62
79
return errors .Wrapf (err , "creating remote destination: %s" , remoteDest )
63
80
}
64
- if err = RoachprodRun (cluster , l , []string {"tar" , "-C" , remoteDest , "-xzf" , archiveRemotePath }); err != nil {
81
+ if err = RoachprodRun (cluster , l , []string {"tar" , "-C" , remoteDest , "-xzf" , archiveRemotePath }, runOptions ); err != nil {
65
82
return errors .Wrapf (err , "extracting archive: %s" , archiveRemotePath )
66
83
}
67
84
0 commit comments