
Commit 78e265e

roachtest: add benchmark roachtests for online restore
Epic: CRDB-37550
Release note: None
1 parent 6a5fe35 commit 78e265e

4 files changed: +153 −62 lines changed

pkg/cmd/roachtest/spec/cluster_spec.go

Lines changed: 9 additions & 7 deletions
@@ -138,7 +138,9 @@ type ClusterSpec struct {
 		MachineType string
 		// VolumeThroughput is the min provisioned EBS volume throughput.
 		VolumeThroughput int
-		Zones string
+		// VolumeIOPS is the provisioned EBS volume IOPS.
+		VolumeIOPS int
+		Zones string
 	} `cloud:"aws"`

 	// Azure-specific arguments. These values apply only on clusters instantiated on Azure.
@@ -220,17 +222,17 @@ func awsMachineSupportsSSD(machineType string) bool {
 }

 func getAWSOpts(
-	machineType string, volumeSize, ebsThroughput int, localSSD bool, useSpotVMs bool,
+	machineType string, volumeSize, ebsThroughput int, ebsIOPS int, localSSD bool, useSpotVMs bool,
 ) vm.ProviderOpts {
 	opts := aws.DefaultProviderOpts()
 	if volumeSize != 0 {
 		opts.DefaultEBSVolume.Disk.VolumeSize = volumeSize
 	}
+	if ebsIOPS != 0 {
+		opts.DefaultEBSVolume.Disk.IOPs = ebsIOPS
+	}
 	if ebsThroughput != 0 {
 		opts.DefaultEBSVolume.Disk.Throughput = ebsThroughput
-		if opts.DefaultEBSVolume.Disk.IOPs < opts.DefaultEBSVolume.Disk.Throughput*4 {
-			opts.DefaultEBSVolume.Disk.IOPs = opts.DefaultEBSVolume.Disk.Throughput * 6
-		}
 	}
 	if localSSD {
 		opts.SSDMachineType = machineType
@@ -518,10 +520,10 @@ func (s *ClusterSpec) RoachprodOpts(
 	var workloadProviderOpts vm.ProviderOpts
 	switch cloud {
 	case AWS:
-		providerOpts = getAWSOpts(machineType, s.VolumeSize, s.AWS.VolumeThroughput,
+		providerOpts = getAWSOpts(machineType, s.VolumeSize, s.AWS.VolumeThroughput, s.AWS.VolumeIOPS,
 			createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
 		workloadProviderOpts = getAWSOpts(workloadMachineType, s.VolumeSize, s.AWS.VolumeThroughput,
-			createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
+			s.AWS.VolumeIOPS, createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
 	case GCE:
 		providerOpts = getGCEOpts(machineType, s.VolumeSize, ssdCount,
 			createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
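Note: getAWSOpts previously derived EBS IOPS from the provisioned throughput; after this change the IOPS value is plumbed through explicitly from the cluster spec. A minimal sketch of the new call shape, assuming a standalone harness; the machine type and sizes below are illustrative, not part of the commit:

cs := ClusterSpec{}
cs.AWS.VolumeThroughput = 800 // MiB/s
cs.AWS.VolumeIOPS = 15_000    // explicit provisioned IOPS, no longer derived from throughput

// volumeSize in GiB; local SSD and spot VMs disabled for the sketch.
opts := getAWSOpts("m6i.8xlarge", 1500, cs.AWS.VolumeThroughput, cs.AWS.VolumeIOPS,
	false /* localSSD */, false /* useSpotVMs */)
_ = opts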

pkg/cmd/roachtest/spec/option.go

Lines changed: 8 additions & 0 deletions
@@ -282,6 +282,14 @@ func AWSVolumeThroughput(throughput int) Option {
 	}
 }

+// AWSVolumeIOPS sets the provisioned IOPS for EBS volumes when the cluster is
+// on AWS.
+func AWSVolumeIOPS(iops int) Option {
+	return func(spec *ClusterSpec) {
+		spec.AWS.VolumeIOPS = iops
+	}
+}
+
 // AWSZones is a node option which requests Geo-distributed nodes; only applies
 // when the test runs on AWS.
 //
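The new option composes with the existing AWSVolumeThroughput option. A minimal sketch, assuming Option is a plain func(*ClusterSpec) as the constructors above suggest; only the two option constructors come from this package, the values are illustrative:

cs := &ClusterSpec{}
for _, opt := range []Option{
	AWSVolumeThroughput(800), // MiB/s, pre-existing option shown in context above
	AWSVolumeIOPS(15_000),    // new option added in this commit
} {
	opt(cs)
}
// cs.AWS.VolumeThroughput == 800, cs.AWS.VolumeIOPS == 15000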

pkg/cmd/roachtest/tests/online_restore.go

Lines changed: 103 additions & 43 deletions
@@ -58,6 +58,9 @@ type onlineRestoreSpecs struct {
 	linkPhaseTimeout time.Duration
 	// downloadPhaseTimeout is the timeout for the download phase of the restore, if set.
 	downloadPhaseTimeout time.Duration
+	// compactionConcurrency overrides the default
+	// storage.max_download_compaction_concurrency cluster setting.
+	compactionConcurrency int
 }

 // restoreWorkload describes the workload that will run during the download
@@ -148,24 +151,67 @@ func registerOnlineRestorePerf(r registry.Registry) {
 			linkPhaseTimeout: 45 * time.Second, // typically takes 20 seconds
 			downloadPhaseTimeout: 20 * time.Minute, // typically takes 10 minutes.
 		},
+		// OR Benchmarking tests
+		// See benchmark plan here: https://docs.google.com/spreadsheets/d/1uPcQ1YPohXKxwFxWWDUMJrYLKQOuqSZKVrI8SJam5n8
 		{
-			// 2TB tpcc Online Restore
 			restoreSpecs: restoreSpecs{
-				hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10, volumeSize: 1500, workloadNode: true}),
+				hardware: makeHardwareSpecs(hardwareSpecs{
+					nodes: 10, volumeSize: 1500, workloadNode: true,
+				}),
 				backup: backupSpecs{
 					cloud: spec.GCE,
 					fixture: MediumFixture,
 				},
-				fullBackupOnly: true,
 				timeout: 3 * time.Hour,
 				suites: registry.Suites(registry.Nightly),
+				fullBackupOnly: true,
 			},
 			workload: tpccRestore{
 				opts: tpccRunOpts{waitFraction: 0, workers: 100, maxRate: 1000},
 			},
 			linkPhaseTimeout: 10 * time.Minute, // typically takes 5 minutes
 			downloadPhaseTimeout: 4 * time.Hour, // typically takes 2 hours.
 		},
+		{
+			restoreSpecs: restoreSpecs{
+				hardware: makeHardwareSpecs(hardwareSpecs{
+					nodes: 10, volumeSize: 1500, workloadNode: true,
+				}),
+				backup: backupSpecs{
+					cloud: spec.GCE,
+					fixture: MediumFixture,
+				},
+				timeout: 3 * time.Hour,
+				suites: registry.Suites(registry.Nightly),
+				fullBackupOnly: true,
+			},
+			workload: tpccRestore{
+				opts: tpccRunOpts{waitFraction: 0, workers: 100, maxRate: 1000},
+			},
+			linkPhaseTimeout: 10 * time.Minute,
+			downloadPhaseTimeout: 4 * time.Hour,
+			compactionConcurrency: 32,
+		},
+		{
+			restoreSpecs: restoreSpecs{
+				hardware: makeHardwareSpecs(hardwareSpecs{
+					nodes: 10, volumeSize: 1500, workloadNode: true, ebsIOPS: 15_000, ebsThroughput: 800,
+				}),
+				backup: backupSpecs{
+					cloud: spec.AWS,
+					fixture: MediumFixture,
+				},
+				timeout: 3 * time.Hour,
+				suites: registry.Suites(registry.Nightly),
+				fullBackupOnly: true,
+			},
+			workload: tpccRestore{
+				opts: tpccRunOpts{waitFraction: 0, workers: 100, maxRate: 1000},
+			},
+			linkPhaseTimeout: 10 * time.Minute,
+			downloadPhaseTimeout: 4 * time.Hour,
+			compactionConcurrency: 32,
+		},
 	} {
 		for _, runOnline := range []bool{true, false} {
 			for _, useWorkarounds := range []bool{true, false} {
@@ -174,6 +220,26 @@ func registerOnlineRestorePerf(r registry.Registry) {
 				runOnline := runOnline
 				runWorkload := runWorkload
 				useWorkarounds := useWorkarounds
+				clusterSettings := []string{
+					// TODO(dt): what's the right value for this? How do we tune this
+					// on the fly automatically during the restore instead of by-hand?
+					// Context: We expect many operations to take longer than usual
+					// when some or all of the data they touch is remote. For now this
+					// is being blanket set to 1h manually, and a user's run-book
+					// would need to do this by hand before an online restore and
+					// reset it manually after, but ideally the queues would be aware
+					// of remote-ness when they pick their own timeouts and pick
+					// accordingly.
+					"kv.queue.process.guaranteed_time_budget='1h'",
+					// TODO(dt): AC appears periodically reduce the workload to 0 QPS
+					// during the download phase (sudden jumps from 0 to 2k qps to 0).
+					// Disable for now until we figure out how to smooth this out.
+					"admission.disk_bandwidth_tokens.elastic.enabled=false",
+					"admission.kv.enabled=false",
+					"admission.sql_kv_response.enabled=false",
+					"kv.consistency_queue.enabled=false",
+					"kv.range_merge.skip_external_bytes.enabled=true",
+				}

 				if runOnline {
 					sp.namePrefix = "online/"
@@ -187,10 +253,24 @@ func registerOnlineRestorePerf(r registry.Registry) {

 				sp.namePrefix = sp.namePrefix + fmt.Sprintf("workload=%t", runWorkload)
 				if !useWorkarounds {
+					clusterSettings = []string{}
 					sp.skip = "used for ad hoc experiments"
 					sp.namePrefix = sp.namePrefix + fmt.Sprintf("/workarounds=%t", useWorkarounds)
 				}

+				if sp.compactionConcurrency != 0 {
+					sp.namePrefix = sp.namePrefix + fmt.Sprintf(
+						"/compaction-concurrency=%d", sp.compactionConcurrency,
+					)
+					clusterSettings = append(
+						clusterSettings,
+						fmt.Sprintf(
+							"storage.max_download_compaction_concurrency=%d", sp.compactionConcurrency,
+						),
+					)
+					sp.skip = "used for ad hoc experiments"
+				}
+
 				if sp.skip == "" && !backuptestutils.IsOnlineRestoreSupported() {
 					sp.skip = "online restore is only tested on development branch"
 				}
@@ -215,7 +295,9 @@ func registerOnlineRestorePerf(r registry.Registry) {
 					rd := makeRestoreDriver(t, c, sp.restoreSpecs)
 					rd.prepareCluster(ctx)

-					restoreStats := runRestore(ctx, t, c, sp, rd, runOnline, runWorkload, useWorkarounds)
+					restoreStats := runRestore(
+						ctx, t, c, sp, rd, runOnline, runWorkload, clusterSettings...,
+					)
 					if runOnline {
 						require.NoError(t, postRestoreValidation(
 							ctx,
@@ -304,10 +386,7 @@ func registerOnlineRestoreCorrectness(r registry.Registry) {
 			rd := makeRestoreDriver(t, c, sp.restoreSpecs)
 			rd.prepareCluster(ctx)

-			runRestore(
-				ctx, t, c, regRestoreSpecs, rd,
-				false /* runOnline */, true /* runWorkload */, false, /* useWorkarounds */
-			)
+			runRestore(ctx, t, c, regRestoreSpecs, rd, false /* runOnline */, true /* runWorkload */)
 			details, err := c.RunWithDetails(
 				ctx,
 				t.L(),
@@ -320,10 +399,7 @@ func registerOnlineRestoreCorrectness(r registry.Registry) {
 			c.Wipe(ctx)
 			rd.prepareCluster(ctx)

-			runRestore(
-				ctx, t, c, orSpecs, rd,
-				true /* runOnline */, true /* runWorkload */, false, /* useWorkarounds */
-			)
+			runRestore(ctx, t, c, orSpecs, rd, true /* runOnline */, true /* runWorkload */)
 			details, err = c.RunWithDetails(
 				ctx,
 				t.L(),
@@ -577,13 +653,24 @@ type restoreStats struct {
 	workloadEndTime time.Time
 }

+// runRestore runs restore based on the provided specs.
+//
+// If runOnline is set, online restore is run, otherwise a conventional restore
+// is run.
+//
+// If runWorkload is set, the workload is run during the download phase of the
+// restore.
+//
+// clusterSettings is a list of key=value pairs of cluster settings to set
+// before performing the restore.
 func runRestore(
 	ctx context.Context,
 	t test.Test,
 	c cluster.Cluster,
 	sp onlineRestoreSpecs,
 	rd restoreDriver,
-	runOnline, runWorkload, useWorkarounds bool,
+	runOnline, runWorkload bool,
+	clusterSettings ...string,
 ) restoreStats {
 	testStartTime := timeutil.Now()

@@ -598,36 +685,9 @@ func runRestore(
 			return err
 		}
 		defer db.Close()
-		if useWorkarounds {
-			// TODO(dt): what's the right value for this? How do we tune this
-			// on the fly automatically during the restore instead of by-hand?
-			// Context: We expect many operations to take longer than usual
-			// when some or all of the data they touch is remote. For now this
-			// is being blanket set to 1h manually, and a user's run-book
-			// would need to do this by hand before an online restore and
-			// reset it manually after, but ideally the queues would be aware
-			// of remote-ness when they pick their own timeouts and pick
-			// accordingly.
-			if _, err := db.Exec("SET CLUSTER SETTING kv.queue.process.guaranteed_time_budget='1h'"); err != nil {
-				return err
-			}
-			// TODO(dt): AC appears periodically reduce the workload to 0 QPS
-			// during the download phase (sudden jumps from 0 to 2k qps to 0).
-			// Disable for now until we figure out how to smooth this out.
-			if _, err := db.Exec("SET CLUSTER SETTING admission.disk_bandwidth_tokens.elastic.enabled=false"); err != nil {
-				return err
-			}
-			if _, err := db.Exec("SET CLUSTER SETTING admission.kv.enabled=false"); err != nil {
-				return err
-			}
-			if _, err := db.Exec("SET CLUSTER SETTING admission.sql_kv_response.enabled=false"); err != nil {
-				return err
-			}
-			if _, err := db.Exec("SET CLUSTER SETTING kv.consistency_queue.enabled=false"); err != nil {
-				return err
-			}
-			if _, err := db.Exec("SET CLUSTER SETTING kv.range_merge.skip_external_bytes.enabled=true"); err != nil {
-				return err
+		for _, setting := range clusterSettings {
+			if _, err := db.Exec(fmt.Sprintf("SET CLUSTER SETTING %s", setting)); err != nil {
+				return errors.Wrapf(err, "failed to set cluster setting %s", setting)
 			}
 		}
 		opts := "WITH UNSAFE_RESTORE_INCOMPATIBLE_VERSION"
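With the signature change, the admission-control and queue workarounds are no longer hard-coded inside runRestore; callers pass key=value strings that the function applies via SET CLUSTER SETTING. A minimal sketch of the new call shape, reusing the variable names from the test body above, with settings copied from the clusterSettings slice in this diff:

settings := []string{
	"kv.queue.process.guaranteed_time_budget='1h'",
	"admission.kv.enabled=false",
	fmt.Sprintf("storage.max_download_compaction_concurrency=%d", 32),
}
stats := runRestore(ctx, t, c, sp, rd, true /* runOnline */, true /* runWorkload */, settings...)
_ = stats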

pkg/cmd/roachtest/tests/restore.go

Lines changed: 33 additions & 12 deletions
@@ -359,23 +359,33 @@ func registerRestore(r registry.Registry) {
 			timeout: 24 * time.Hour,
 			suites: registry.Suites(registry.Weekly),
 		},
-		// Following two tests are just used to benchmark classic restore against
-		// OR with the exact same fixtures and hardware.
+		// OR Benchmarking tests
+		// See benchmark plan here: https://docs.google.com/spreadsheets/d/1uPcQ1YPohXKxwFxWWDUMJrYLKQOuqSZKVrI8SJam5n8
 		{
-			hardware: makeHardwareSpecs(hardwareSpecs{}),
-			backup: backupSpecs{cloud: spec.GCE, fixture: SmallFixture},
-			timeout: 1 * time.Hour,
-			suites: registry.Suites(registry.Nightly),
+			hardware: makeHardwareSpecs(hardwareSpecs{
+				nodes: 10, volumeSize: 1500,
+			}),
+			backup: backupSpecs{
+				cloud: spec.GCE,
+				fixture: MediumFixture,
+			},
+			timeout: 3 * time.Hour,
 			fullBackupOnly: true,
-			skip: "used for adhoc benchmarking against OR",
+			suites: registry.Suites(registry.Nightly),
+			skip: "used for OR benchmarking purposes",
 		},
 		{
-			hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10, volumeSize: 1500, workloadNode: true}),
-			backup: backupSpecs{cloud: spec.GCE, fixture: MediumFixture},
+			hardware: makeHardwareSpecs(hardwareSpecs{
+				nodes: 10, volumeSize: 1500, ebsIOPS: 15_000, ebsThroughput: 800,
+			}),
+			backup: backupSpecs{
+				cloud: spec.AWS,
+				fixture: MediumFixture,
+			},
 			timeout: 3 * time.Hour,
-			suites: registry.Suites(registry.Nightly),
 			fullBackupOnly: true,
-			skip: "used for adhoc benchmarking against OR",
+			suites: registry.Suites(registry.Nightly),
+			skip: "used for OR benchmarking purposes",
 		},
 		// TODO(msbutler): add the following tests once roachperf/grafana is hooked up and old tests are
 		// removed:
@@ -483,9 +493,14 @@ type hardwareSpecs struct {
 	useLocalSSD bool

 	// ebsThroughput is the min provisioned throughput of the EBS volume, in MB/s.
-	// TODO(pavelkalinnikov): support provisioning throughput not only on EBS.
+	// Ignored if not running on AWS. Defaults to 125 MiB/s for the default gp3
+	// volume.
 	ebsThroughput int

+	// ebsIOPS is the configured IOPS for the EBS volume. Ignored if not running
+	// on AWS. Defaults to 3000 IOPS for the default gp3 volume.
+	ebsIOPS int
+
 	// mem is the memory per cpu.
 	mem spec.MemPerCPU

@@ -503,6 +518,9 @@ func (hw hardwareSpecs) makeClusterSpecs(r registry.Registry) spec.ClusterSpec {
 	if hw.ebsThroughput != 0 {
 		clusterOpts = append(clusterOpts, spec.AWSVolumeThroughput(hw.ebsThroughput))
 	}
+	if hw.ebsIOPS != 0 {
+		clusterOpts = append(clusterOpts, spec.AWSVolumeIOPS(hw.ebsIOPS))
+	}

 	if hw.useLocalSSD {
 		clusterOpts = append(clusterOpts, spec.PreferLocalSSD())
@@ -600,6 +618,9 @@ func makeHardwareSpecs(override hardwareSpecs) hardwareSpecs {
 	if override.ebsThroughput != 0 {
 		specs.ebsThroughput = override.ebsThroughput
 	}
+	if override.ebsIOPS != 0 {
+		specs.ebsIOPS = override.ebsIOPS
+	}
 	if specs.useLocalSSD {
 		specs.volumeSize = 0
 		specs.ebsThroughput = 0
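The new ebsIOPS knob follows the same override pattern as ebsThroughput: set on hardwareSpecs, merged by makeHardwareSpecs, and turned into a cluster option in makeClusterSpecs. A minimal sketch mirroring the AWS benchmark entry above, with the surrounding test registration omitted and values copied from that entry:

hw := makeHardwareSpecs(hardwareSpecs{
	nodes: 10, volumeSize: 1500, ebsIOPS: 15_000, ebsThroughput: 800,
})
// makeClusterSpecs will then append spec.AWSVolumeThroughput(800) and
// spec.AWSVolumeIOPS(15_000) when building the roachprod cluster spec.
_ = hw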
