Skip to content

Commit 24cce8b

Browse files
craig[bot] and herkolategan committed
Merge #154383
154383: roachprod: auto restart option for cockroach processes r=srosenberg a=herkolategan

Previously, if a cockroach process died, it had to be restarted manually. This is the expected behavior in most circumstances, but it is not always practical, for instance when running a long unattended benchmark. To support such cases, a new start option has been added that automatically restarts the service if a process fails (e.g., due to a disk stall). This ensures the process comes back online quickly without manual intervention.

Epic: None
Release note: None

Co-authored-by: Herko Lategan <[email protected]>
2 parents e1d6122 + 13cbb1c commit 24cce8b

File tree

4 files changed

+15
-0
lines changed

4 files changed

+15
-0
lines changed

pkg/cmd/roachprod/cli/flags.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,8 @@ func initFlagsStartOpsForCmd(cmd *cobra.Command) {
453453
cmd.Flags().BoolVar(&startOpts.EnableFluentSink,
454454
"enable-fluent-sink", startOpts.EnableFluentSink,
455455
"whether to enable the fluent-servers attribute in the CockroachDB logging configuration")
456+
cmd.Flags().BoolVar(&startOpts.AutoRestart,
457+
"auto-restart", startOpts.AutoRestart, "automatically restart cockroach processes that die")
456458
}
457459

458460
func initFlagInsecureIgnoreHostKeyForCmd(cmd *cobra.Command) {

pkg/roachprod/install/cockroach.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ type StartOpts struct {
152152
// initialization and sequential node starts and also reuses the previous start script.
153153
IsRestart bool
154154

155+
// AutoRestart enables automatically restarting a process if it died.
156+
AutoRestart bool
157+
155158
// EnableFluentSink determines whether to enable the fluent-servers attribute
156159
// in the CockroachDB logging configuration.
157160
EnableFluentSink bool
@@ -1004,6 +1007,7 @@ func (c *SyncedCluster) generateStartCmd(
10041007
NumFilesLimit: startOpts.NumFilesLimit,
10051008
VirtualClusterLabel: VirtualClusterLabel(startOpts.VirtualClusterName, startOpts.SQLInstance),
10061009
Local: c.IsLocal(),
1010+
AutoRestart: startOpts.AutoRestart,
10071011
})
10081012
}
10091013

@@ -1017,6 +1021,7 @@ type startTemplateData struct {
10171021
VirtualClusterLabel string
10181022
Args []string
10191023
EnvVars []string
1024+
AutoRestart bool
10201025
}
10211026

10221027
type loggingTemplateData struct {

pkg/roachprod/install/scripts/start.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ KEY_CMD=#{.KeyCmd#}
1616
MEMORY_MAX=#{.MemoryMax#}
1717
NUM_FILES_LIMIT=#{.NumFilesLimit#}
1818
VIRTUAL_CLUSTER_LABEL=#{.VirtualClusterLabel#}
19+
#{if .AutoRestart#}
20+
AUTO_RESTART=1
21+
#{end#}
22+
1923
ARGS=(
2024
#{range .Args -#}
2125
#{shesc .#}
@@ -99,4 +103,5 @@ sudo systemd-run --unit "${VIRTUAL_CLUSTER_LABEL}" \
99103
-p "MemoryMax=${MEMORY_MAX}" \
100104
-p LimitCORE=infinity \
101105
-p "LimitNOFILE=${NUM_FILES_LIMIT}" \
106+
${AUTO_RESTART:+-p Restart=always -p RestartSec=5s -p StartLimitIntervalSec=60s -p StartLimitBurst=3} \
102107
bash "${0}" run

pkg/roachprod/install/testdata/start/start.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ echo bar $HOME
2020
MEMORY_MAX=81%
2121
NUM_FILES_LIMIT=0
2222
VIRTUAL_CLUSTER_LABEL=cockroach-system
23+
24+
2325
ARGS=(
2426
start
2527
--log
@@ -103,6 +105,7 @@ sudo systemd-run --unit "${VIRTUAL_CLUSTER_LABEL}" \
103105
-p "MemoryMax=${MEMORY_MAX}" \
104106
-p LimitCORE=infinity \
105107
-p "LimitNOFILE=${NUM_FILES_LIMIT}" \
108+
${AUTO_RESTART:+-p Restart=always -p RestartSec=5s -p StartLimitIntervalSec=60s -p StartLimitBurst=3} \
106109
bash "${0}" run
107110
----
108111
----

0 commit comments

Comments
 (0)