Skip to content

Commit 922c9f7

Browse files
committed
restore: scale number of OR link workers to number of nodes
This patch teaches online restore to scale the number of link workers to the number of nodes in the cluster instead of statically setting it to 32. In the new behavior, the default value for `backup.restore.online_worker_count` is 0, which sets the number of link workers to two times the number of nodes. Epic: CRDB-48786 Fixes: #146584
1 parent ac39c2e commit 922c9f7

File tree

1 file changed

+31
-6
lines changed

1 file changed

+31
-6
lines changed

pkg/backup/restore_online.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,14 @@ import (
4646
"github.com/cockroachdb/errors"
4747
)
4848

49-
var onlineRestoreLinkWorkers = settings.RegisterByteSizeSetting(
49+
const defaultLinkWorkersPerNode = 2
50+
51+
var onlineRestoreLinkWorkers = settings.RegisterIntSetting(
5052
settings.ApplicationLevel,
5153
"backup.restore.online_worker_count",
52-
"workers to use for online restore link phase",
53-
32,
54-
settings.PositiveInt,
54+
"total workers to use for online restore link phase (defaults to 2x number of nodes if set to 0)",
55+
0,
56+
settings.NonNegativeInt,
5557
)
5658

5759
var onlineRestoreLayerLimit = settings.RegisterIntSetting(
@@ -83,7 +85,10 @@ func splitAndScatter(
8385

8486
log.Dev.Infof(ctx, "splitting and scattering spans")
8587

86-
workers := int(onlineRestoreLinkWorkers.Get(&execCtx.ExecCfg().Settings.SV))
88+
workers, err := getNumOnlineRestoreLinkWorkers(ctx, execCtx)
89+
if err != nil {
90+
return err
91+
}
8792
toScatter := make(chan execinfrapb.RestoreSpanEntry, 1)
8893
toSplit := make(chan execinfrapb.RestoreSpanEntry, workers)
8994

@@ -270,7 +275,10 @@ func linkExternalFiles(
270275

271276
log.Dev.Infof(ctx, "ingesting remote files")
272277

273-
workers := int(onlineRestoreLinkWorkers.Get(&execCtx.ExecCfg().Settings.SV))
278+
workers, err := getNumOnlineRestoreLinkWorkers(ctx, execCtx)
279+
if err != nil {
280+
return 0, 0, err
281+
}
274282

275283
grp := ctxgroup.WithContext(ctx)
276284
ch := make(chan execinfrapb.RestoreSpanEntry, workers)
@@ -1039,3 +1047,20 @@ func (r *restoreResumer) maybeCleanupFailedOnlineRestore(
10391047

10401048
return unstickRestoreSpans(ctx, p.ExecCfg(), details.DownloadSpans)
10411049
}
1050+
1051+
// getNumOnlineRestoreLinkWorkers returns the total number of workers to use for
1052+
// the link phase of an online restore.
1053+
func getNumOnlineRestoreLinkWorkers(ctx context.Context, execCtx sql.JobExecContext) (int, error) {
1054+
if workers := onlineRestoreLinkWorkers.Get(&execCtx.ExecCfg().Settings.SV); workers > 0 {
1055+
return int(workers), nil
1056+
}
1057+
// All nodes are used in a restore
1058+
_, sqlInstanceIDs, err := execCtx.ExecCfg().DistSQLPlanner.SetupAllNodesPlanning(
1059+
ctx, execCtx.ExtendedEvalContext(), execCtx.ExecCfg(),
1060+
)
1061+
if err != nil {
1062+
return 0, err
1063+
}
1064+
numNodes := min(len(sqlInstanceIDs), 1)
1065+
return defaultLinkWorkersPerNode * numNodes, nil
1066+
}

0 commit comments

Comments
 (0)