@@ -58,6 +58,9 @@ type onlineRestoreSpecs struct {
58
58
linkPhaseTimeout time.Duration
59
59
// downloadPhaseTimeout is the timeout for the download phase of the restore, if set.
60
60
downloadPhaseTimeout time.Duration
61
+ // compactionConcurrency overrides the default
62
+ // storage.max_download_compaction_concurrency cluster setting.
63
+ compactionConcurrency int
61
64
}
62
65
63
66
// restoreWorkload describes the workload that will run during the download
@@ -148,24 +151,67 @@ func registerOnlineRestorePerf(r registry.Registry) {
148
151
linkPhaseTimeout : 45 * time .Second , // typically takes 20 seconds
149
152
downloadPhaseTimeout : 20 * time .Minute , // typically takes 10 minutes.
150
153
},
154
+ // Online restore (OR) benchmarking tests.
155
+ // See benchmark plan here: https://docs.google.com/spreadsheets/d/1uPcQ1YPohXKxwFxWWDUMJrYLKQOuqSZKVrI8SJam5n8
151
156
{
152
- // 2TB tpcc Online Restore
153
157
restoreSpecs : restoreSpecs {
154
- hardware : makeHardwareSpecs (hardwareSpecs {nodes : 10 , volumeSize : 1500 , workloadNode : true }),
158
+ hardware : makeHardwareSpecs (hardwareSpecs {
159
+ nodes : 10 , volumeSize : 1500 , workloadNode : true ,
160
+ }),
155
161
backup : backupSpecs {
156
162
cloud : spec .GCE ,
157
163
fixture : MediumFixture ,
158
164
},
159
- fullBackupOnly : true ,
160
165
timeout : 3 * time .Hour ,
161
166
suites : registry .Suites (registry .Nightly ),
167
+ fullBackupOnly : true ,
162
168
},
163
169
workload : tpccRestore {
164
170
opts : tpccRunOpts {waitFraction : 0 , workers : 100 , maxRate : 1000 },
165
171
},
166
172
linkPhaseTimeout : 10 * time .Minute , // typically takes 5 minutes
167
173
downloadPhaseTimeout : 4 * time .Hour , // typically takes 2 hours.
168
174
},
175
+ {
176
+ restoreSpecs : restoreSpecs {
177
+ hardware : makeHardwareSpecs (hardwareSpecs {
178
+ nodes : 10 , volumeSize : 1500 , workloadNode : true ,
179
+ }),
180
+ backup : backupSpecs {
181
+ cloud : spec .GCE ,
182
+ fixture : MediumFixture ,
183
+ },
184
+ timeout : 3 * time .Hour ,
185
+ suites : registry .Suites (registry .Nightly ),
186
+ fullBackupOnly : true ,
187
+ },
188
+ workload : tpccRestore {
189
+ opts : tpccRunOpts {waitFraction : 0 , workers : 100 , maxRate : 1000 },
190
+ },
191
+ linkPhaseTimeout : 10 * time .Minute ,
192
+ downloadPhaseTimeout : 4 * time .Hour ,
193
+ compactionConcurrency : 32 ,
194
+ },
195
+ {
196
+ restoreSpecs : restoreSpecs {
197
+ hardware : makeHardwareSpecs (hardwareSpecs {
198
+ nodes : 10 , volumeSize : 1500 , workloadNode : true , ebsIOPS : 15_000 , ebsThroughput : 800 ,
199
+ }),
200
+ backup : backupSpecs {
201
+ cloud : spec .AWS ,
202
+ fixture : MediumFixture ,
203
+ },
204
+ timeout : 3 * time .Hour ,
205
+ suites : registry .Suites (registry .Nightly ),
206
+ fullBackupOnly : true ,
207
+ },
208
+ workload : tpccRestore {
209
+ opts : tpccRunOpts {waitFraction : 0 , workers : 100 , maxRate : 1000 },
210
+ },
211
+ linkPhaseTimeout : 10 * time .Minute ,
212
+ downloadPhaseTimeout : 4 * time .Hour ,
213
+ compactionConcurrency : 32 ,
214
+ },
169
215
} {
170
216
for _ , runOnline := range []bool {true , false } {
171
217
for _ , useWorkarounds := range []bool {true , false } {
@@ -174,6 +220,26 @@ func registerOnlineRestorePerf(r registry.Registry) {
174
220
runOnline := runOnline
175
221
runWorkload := runWorkload
176
222
useWorkarounds := useWorkarounds
223
+ clusterSettings := []string {
224
+ // TODO(dt): what's the right value for this? How do we tune this
225
+ // on the fly automatically during the restore instead of by-hand?
226
+ // Context: We expect many operations to take longer than usual
227
+ // when some or all of the data they touch is remote. For now this
228
+ // is being blanket set to 1h manually, and a user's run-book
229
+ // would need to do this by hand before an online restore and
230
+ // reset it manually after, but ideally the queues would be aware
231
+ // of remote-ness when they pick their own timeouts and pick
232
+ // accordingly.
233
+ "kv.queue.process.guaranteed_time_budget='1h'" ,
234
+ // TODO(dt): AC appears to periodically reduce the workload to 0 QPS
235
+ // during the download phase (sudden jumps from 0 to 2k qps to 0).
236
+ // Disable for now until we figure out how to smooth this out.
237
+ "admission.disk_bandwidth_tokens.elastic.enabled=false" ,
238
+ "admission.kv.enabled=false" ,
239
+ "admission.sql_kv_response.enabled=false" ,
240
+ "kv.consistency_queue.enabled=false" ,
241
+ "kv.range_merge.skip_external_bytes.enabled=true" ,
242
+ }
177
243
178
244
if runOnline {
179
245
sp .namePrefix = "online/"
@@ -187,10 +253,24 @@ func registerOnlineRestorePerf(r registry.Registry) {
187
253
188
254
sp .namePrefix = sp .namePrefix + fmt .Sprintf ("workload=%t" , runWorkload )
189
255
if ! useWorkarounds {
256
+ clusterSettings = []string {}
190
257
sp .skip = "used for ad hoc experiments"
191
258
sp .namePrefix = sp .namePrefix + fmt .Sprintf ("/workarounds=%t" , useWorkarounds )
192
259
}
193
260
261
+ if sp .compactionConcurrency != 0 {
262
+ sp .namePrefix = sp .namePrefix + fmt .Sprintf (
263
+ "/compaction-concurrency=%d" , sp .compactionConcurrency ,
264
+ )
265
+ clusterSettings = append (
266
+ clusterSettings ,
267
+ fmt .Sprintf (
268
+ "storage.max_download_compaction_concurrency=%d" , sp .compactionConcurrency ,
269
+ ),
270
+ )
271
+ sp .skip = "used for ad hoc experiments"
272
+ }
273
+
194
274
if sp .skip == "" && ! backuptestutils .IsOnlineRestoreSupported () {
195
275
sp .skip = "online restore is only tested on development branch"
196
276
}
@@ -215,7 +295,9 @@ func registerOnlineRestorePerf(r registry.Registry) {
215
295
rd := makeRestoreDriver (t , c , sp .restoreSpecs )
216
296
rd .prepareCluster (ctx )
217
297
218
- restoreStats := runRestore (ctx , t , c , sp , rd , runOnline , runWorkload , useWorkarounds )
298
+ restoreStats := runRestore (
299
+ ctx , t , c , sp , rd , runOnline , runWorkload , clusterSettings ... ,
300
+ )
219
301
if runOnline {
220
302
require .NoError (t , postRestoreValidation (
221
303
ctx ,
@@ -304,10 +386,7 @@ func registerOnlineRestoreCorrectness(r registry.Registry) {
304
386
rd := makeRestoreDriver (t , c , sp .restoreSpecs )
305
387
rd .prepareCluster (ctx )
306
388
307
- runRestore (
308
- ctx , t , c , regRestoreSpecs , rd ,
309
- false /* runOnline */ , true /* runWorkload */ , false , /* useWorkarounds */
310
- )
389
+ runRestore (ctx , t , c , regRestoreSpecs , rd , false /* runOnline */ , true /* runWorkload */ )
311
390
details , err := c .RunWithDetails (
312
391
ctx ,
313
392
t .L (),
@@ -320,10 +399,7 @@ func registerOnlineRestoreCorrectness(r registry.Registry) {
320
399
c .Wipe (ctx )
321
400
rd .prepareCluster (ctx )
322
401
323
- runRestore (
324
- ctx , t , c , orSpecs , rd ,
325
- true /* runOnline */ , true /* runWorkload */ , false , /* useWorkarounds */
326
- )
402
+ runRestore (ctx , t , c , orSpecs , rd , true /* runOnline */ , true /* runWorkload */ )
327
403
details , err = c .RunWithDetails (
328
404
ctx ,
329
405
t .L (),
@@ -577,13 +653,24 @@ type restoreStats struct {
577
653
workloadEndTime time.Time
578
654
}
579
655
656
+ // runRestore runs restore based on the provided specs.
657
+ //
658
+ // If runOnline is set, online restore is run, otherwise a conventional restore
659
+ // is run.
660
+ //
661
+ // If runWorkload is set, the workload is run during the download phase of the
662
+ // restore.
663
+ //
664
+ // clusterSettings is a list of key=value pairs of cluster settings to set
665
+ // before performing the restore.
580
666
func runRestore (
581
667
ctx context.Context ,
582
668
t test.Test ,
583
669
c cluster.Cluster ,
584
670
sp onlineRestoreSpecs ,
585
671
rd restoreDriver ,
586
- runOnline , runWorkload , useWorkarounds bool ,
672
+ runOnline , runWorkload bool ,
673
+ clusterSettings ... string ,
587
674
) restoreStats {
588
675
testStartTime := timeutil .Now ()
589
676
@@ -598,36 +685,9 @@ func runRestore(
598
685
return err
599
686
}
600
687
defer db .Close ()
601
- if useWorkarounds {
602
- // TODO(dt): what's the right value for this? How do we tune this
603
- // on the fly automatically during the restore instead of by-hand?
604
- // Context: We expect many operations to take longer than usual
605
- // when some or all of the data they touch is remote. For now this
606
- // is being blanket set to 1h manually, and a user's run-book
607
- // would need to do this by hand before an online restore and
608
- // reset it manually after, but ideally the queues would be aware
609
- // of remote-ness when they pick their own timeouts and pick
610
- // accordingly.
611
- if _ , err := db .Exec ("SET CLUSTER SETTING kv.queue.process.guaranteed_time_budget='1h'" ); err != nil {
612
- return err
613
- }
614
- // TODO(dt): AC appears periodically reduce the workload to 0 QPS
615
- // during the download phase (sudden jumps from 0 to 2k qps to 0).
616
- // Disable for now until we figure out how to smooth this out.
617
- if _ , err := db .Exec ("SET CLUSTER SETTING admission.disk_bandwidth_tokens.elastic.enabled=false" ); err != nil {
618
- return err
619
- }
620
- if _ , err := db .Exec ("SET CLUSTER SETTING admission.kv.enabled=false" ); err != nil {
621
- return err
622
- }
623
- if _ , err := db .Exec ("SET CLUSTER SETTING admission.sql_kv_response.enabled=false" ); err != nil {
624
- return err
625
- }
626
- if _ , err := db .Exec ("SET CLUSTER SETTING kv.consistency_queue.enabled=false" ); err != nil {
627
- return err
628
- }
629
- if _ , err := db .Exec ("SET CLUSTER SETTING kv.range_merge.skip_external_bytes.enabled=true" ); err != nil {
630
- return err
688
+ for _ , setting := range clusterSettings {
689
+ if _ , err := db .Exec (fmt .Sprintf ("SET CLUSTER SETTING %s" , setting )); err != nil {
690
+ return errors .Wrapf (err , "failed to set cluster setting %s" , setting )
631
691
}
632
692
}
633
693
opts := "WITH UNSAFE_RESTORE_INCOMPATIBLE_VERSION"
0 commit comments