Skip to content

Commit 53a2a95

Browse files
craig[bot] and herkolategan committed
Merge #153952
153952: roachtest, roachprod: support no disks on workload machines r=golgeek a=herkolategan Previously, workload machines automatically inherited the same disk setup as the rest of the cluster. To optimize for cost, we now allow workload machines to have only boot disks. The default behavior, for providers, is to attach a standard data disk, or a number of local SSDs depending on the cluster spec. Thus, in order to explicitly prevent any data disk from being configured, a new provider option "BootDiskOnly" has been added to prevent the default behavior, specifically for workload machines. Epic: None Release note: None Co-authored-by: Herko Lategan <[email protected]>
2 parents 5cda21f + dc9293e commit 53a2a95

File tree

19 files changed

+185
-77
lines changed

19 files changed

+185
-77
lines changed

pkg/cmd/roachtest/spec/cluster_spec.go

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -195,9 +195,10 @@ type ClusterSpec struct {
195195
// treated as workload node. Defaults to a VM with 4 CPUs if not specified
196196
// by WorkloadNodeCPUs.
197197
// TODO(GouravKumar): remove use of WorkloadNode, use WorkloadNodeCount instead
198-
WorkloadNode bool
199-
WorkloadNodeCount int
200-
WorkloadNodeCPUs int
198+
WorkloadNode bool
199+
WorkloadNodeCount int
200+
WorkloadNodeCPUs int
201+
WorkloadRequiresDisk bool
201202
// CPUs is the number of CPUs per node.
202203
CPUs int
203204
Mem MemPerCPU
@@ -317,7 +318,12 @@ func awsMachineSupportsSSD(machineType string) bool {
317318
}
318319

319320
func getAWSOpts(
320-
machineType string, volumeSize, ebsThroughput int, ebsIOPS int, localSSD bool, useSpotVMs bool,
321+
machineType string,
322+
volumeSize, ebsThroughput int,
323+
ebsIOPS int,
324+
localSSD bool,
325+
useSpotVMs bool,
326+
bootDiskOnly bool,
321327
) vm.ProviderOpts {
322328
opts := aws.DefaultProviderOpts()
323329
if volumeSize != 0 {
@@ -335,6 +341,7 @@ func getAWSOpts(
335341
opts.MachineType = machineType
336342
}
337343
opts.UseSpot = useSpotVMs
344+
opts.BootDiskOnly = bootDiskOnly
338345
return opts
339346
}
340347

@@ -349,6 +356,7 @@ func getGCEOpts(
349356
volumeType string,
350357
volumeCount int,
351358
useSpot bool,
359+
bootDiskOnly bool,
352360
) vm.ProviderOpts {
353361
opts := gce.DefaultProviderOpts()
354362
opts.MachineType = machineType
@@ -377,16 +385,17 @@ func getGCEOpts(
377385
if volumeType != "" {
378386
opts.PDVolumeType = volumeType
379387
}
380-
388+
opts.BootDiskOnly = bootDiskOnly
381389
return opts
382390
}
383391

384-
func getAzureOpts(machineType string, volumeSize int) vm.ProviderOpts {
392+
func getAzureOpts(machineType string, volumeSize int, bootDiskOnly bool) vm.ProviderOpts {
385393
opts := azure.DefaultProviderOpts()
386394
opts.MachineType = machineType
387395
if volumeSize != 0 {
388396
opts.NetworkDiskSize = int32(volumeSize)
389397
}
398+
opts.BootDiskOnly = bootDiskOnly
390399
return opts
391400
}
392401

@@ -398,6 +407,7 @@ func getIBMOpts(
398407
volumeIOPS int,
399408
extraVolumeCount int,
400409
RAID0 bool,
410+
bootDiskOnly bool,
401411
) vm.ProviderOpts {
402412
opts := ibm.DefaultProviderOpts()
403413
opts.MachineType = machineType
@@ -425,6 +435,7 @@ func getIBMOpts(
425435
}
426436
opts.UseMultipleDisks = !RAID0
427437
}
438+
opts.BootDiskOnly = bootDiskOnly
428439

429440
return opts
430441
}
@@ -601,7 +612,7 @@ func (s *ClusterSpec) RoachprodOpts(
601612
var err error
602613
switch cloud {
603614
case AWS:
604-
workloadMachineType, _, err = SelectAWSMachineType(s.WorkloadNodeCPUs, s.Mem, preferLocalSSD && s.VolumeSize == 0, selectedArch)
615+
workloadMachineType, _, err = SelectAWSMachineType(s.WorkloadNodeCPUs, s.Mem, false, selectedArch)
605616
case GCE:
606617
workloadMachineType, _ = SelectGCEMachineType(s.WorkloadNodeCPUs, s.Mem, selectedArch)
607618
case Azure:
@@ -623,27 +634,29 @@ func (s *ClusterSpec) RoachprodOpts(
623634
switch cloud {
624635
case AWS:
625636
providerOpts = getAWSOpts(machineType, s.VolumeSize, s.AWS.VolumeThroughput, s.AWS.VolumeIOPS,
626-
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
627-
workloadProviderOpts = getAWSOpts(workloadMachineType, s.VolumeSize, s.AWS.VolumeThroughput,
628-
s.AWS.VolumeIOPS, createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
637+
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs, false)
638+
workloadProviderOpts = getAWSOpts(workloadMachineType, s.VolumeSize, s.AWS.VolumeThroughput, s.AWS.VolumeIOPS,
639+
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs, !s.WorkloadRequiresDisk)
629640
case GCE:
630641
providerOpts = getGCEOpts(machineType, s.VolumeSize, ssdCount,
631642
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
632-
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType, s.GCE.VolumeCount, s.UseSpotVMs,
643+
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType,
644+
s.GCE.VolumeCount, s.UseSpotVMs, false,
633645
)
634646
workloadProviderOpts = getGCEOpts(workloadMachineType, s.VolumeSize, ssdCount,
635647
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
636-
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType, s.GCE.VolumeCount, s.UseSpotVMs,
648+
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType,
649+
s.GCE.VolumeCount, s.UseSpotVMs, !s.WorkloadRequiresDisk,
637650
)
638651
case Azure:
639-
providerOpts = getAzureOpts(machineType, s.VolumeSize)
640-
workloadProviderOpts = getAzureOpts(workloadMachineType, s.VolumeSize)
652+
providerOpts = getAzureOpts(machineType, s.VolumeSize, false)
653+
workloadProviderOpts = getAzureOpts(workloadMachineType, s.VolumeSize, true)
641654
case IBM:
642655
providerOpts = getIBMOpts(machineType, s.TerminateOnMigration, s.VolumeSize,
643-
s.IBM.VolumeType, s.IBM.VolumeIOPS, s.IBM.VolumeCount, s.RAID0,
656+
s.IBM.VolumeType, s.IBM.VolumeIOPS, s.IBM.VolumeCount, s.RAID0, false,
644657
)
645658
workloadProviderOpts = getIBMOpts(workloadMachineType, s.TerminateOnMigration, s.VolumeSize,
646-
s.IBM.VolumeType, s.IBM.VolumeIOPS, s.IBM.VolumeCount, s.RAID0,
659+
s.IBM.VolumeType, s.IBM.VolumeIOPS, s.IBM.VolumeCount, s.RAID0, true,
647660
)
648661
}
649662

pkg/cmd/roachtest/spec/option.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,15 @@ func WorkloadNodeCPU(n int) Option {
5353
}
5454
}
5555

56+
// WorkloadRequiresDisk should be used if the workload nodes should have the
57+
// exact same disk configuration as the rest of the cluster. Otherwise, all
58+
// workload nodes only have a boot disk.
59+
func WorkloadRequiresDisk() Option {
60+
return func(spec *ClusterSpec) {
61+
spec.WorkloadRequiresDisk = true
62+
}
63+
}
64+
5665
// Mem requests nodes with low/standard/high ratio of memory per CPU.
5766
func Mem(level MemPerCPU) Option {
5867
return func(spec *ClusterSpec) {

pkg/roachprod/install/cluster_synced.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1085,7 +1085,10 @@ func (c *SyncedCluster) Wait(ctx context.Context, l *logger.Logger) error {
10851085
func(ctx context.Context, node Node) (*RunResultDetails, error) {
10861086
res := &RunResultDetails{Node: node}
10871087
var err error
1088-
cmd := fmt.Sprintf("test -e %s -a -e %s", vm.DisksInitializedFile, vm.OSInitializedFile)
1088+
// Only the `vm.OSInitializedFile` file is checked and not the
1089+
// `vm.DisksInitializedFile`, because it's possible to create VMs without
1090+
// any attached disks other than the boot disk.
1091+
cmd := fmt.Sprintf("test -e %s", vm.OSInitializedFile)
10891092
// N.B. we disable ssh debug output capture, lest we end up with _thousands_ of useless .log files.
10901093
opts := cmdOptsWithDebugDisabled()
10911094
for j := 0; j < 600; j++ {

pkg/roachprod/vm/aws/aws.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,9 @@ type ProviderOpts struct {
279279
// use spot vms, spot vms are significantly cheaper, but can be preempted by AWS.
280280
// see https://aws.amazon.com/ec2/spot/ for more details.
281281
UseSpot bool
282+
// BootDiskOnly ensures that no additional disks will be attached, other than
283+
// the boot disk.
284+
BootDiskOnly bool
282285
}
283286

284287
// Provider implements the vm.Provider interface for AWS.
@@ -540,6 +543,8 @@ func (o *ProviderOpts) ConfigureCreateFlags(flags *pflag.FlagSet) {
540543
false, "use AWS Spot VMs, which are significantly cheaper, but can be preempted by AWS.")
541544
flags.StringVar(&o.IAMProfile, ProviderName+"-iam-profile", o.IAMProfile,
542545
"the IAM instance profile to associate with created VMs if non-empty")
546+
flags.BoolVar(&o.BootDiskOnly, ProviderName+"-boot-disk-only", o.BootDiskOnly,
547+
"Only attach the boot disk. No additional volumes will be provisioned even if specified.")
543548
}
544549

545550
// ConfigureClusterCleanupFlags implements ProviderOpts.
@@ -1348,13 +1353,15 @@ func (p *Provider) runInstance(
13481353
extraMountOpts = "nobarrier"
13491354
}
13501355
}
1356+
13511357
filename, err := writeStartupScript(
13521358
name,
13531359
extraMountOpts,
13541360
opts.SSDOpts.FileSystem,
13551361
providerOpts.UseMultipleDisks,
13561362
opts.Arch == string(vm.ArchFIPS),
13571363
providerOpts.RemoteUserName,
1364+
providerOpts.BootDiskOnly,
13581365
)
13591366
if err != nil {
13601367
return nil, errors.Wrapf(err, "could not write AWS startup script to temp file")
@@ -1605,7 +1612,7 @@ func assignEBSVolumes(opts *vm.CreateOpts, providerOpts *ProviderOpts) ebsVolume
16051612
// Make a local copy of providerOpts.EBSVolumes to prevent data races
16061613
ebsVolumes := providerOpts.EBSVolumes
16071614
// The local NVMe devices are automatically mapped. Otherwise, we need to map an EBS data volume.
1608-
if !opts.SSDOpts.UseLocalSSD {
1615+
if !opts.SSDOpts.UseLocalSSD && !providerOpts.BootDiskOnly {
16091616
if len(ebsVolumes) == 0 && providerOpts.DefaultEBSVolume.Disk.VolumeType == "" {
16101617
providerOpts.DefaultEBSVolume.Disk.VolumeType = defaultEBSVolumeType
16111618
providerOpts.DefaultEBSVolume.Disk.DeleteOnTermination = true

pkg/roachprod/vm/aws/support.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ const awsStartupScriptTemplate = `#!/usr/bin/env bash
3030
# Script for setting up a AWS machine for roachprod use.
3131
3232
function setup_disks() {
33+
{{ if .BootDiskOnly }}
34+
mkdir -p /mnt/data1 && chmod 777 /mnt/data1
35+
echo "VM has no disk attached other than the boot disk."
36+
return 0
37+
{{ end }}
38+
3339
{{ if not .Zfs }}
3440
mount_opts="defaults,nofail"
3541
{{if .ExtraMountOpts}}mount_opts="${mount_opts},{{.ExtraMountOpts}}"{{end}}
@@ -181,11 +187,13 @@ func writeStartupScript(
181187
useMultiple bool,
182188
enableFips bool,
183189
remoteUser string,
190+
bootDiskOnly bool,
184191
) (string, error) {
185192
type tmplParams struct {
186193
vm.StartupArgs
187194
ExtraMountOpts string
188195
UseMultipleDisks bool
196+
BootDiskOnly bool
189197
}
190198

191199
args := tmplParams{
@@ -198,6 +206,7 @@ func writeStartupScript(
198206
),
199207
ExtraMountOpts: extraMountOpts,
200208
UseMultipleDisks: useMultiple,
209+
BootDiskOnly: bootDiskOnly,
201210
}
202211

203212
tmpfile, err := os.CreateTemp("", "aws-startup-script")

pkg/roachprod/vm/aws/support_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ import (
1717

1818
// TestWriteStartupScriptTemplate mainly tests the startup script tpl compiles.
1919
func TestWriteStartupScriptTemplate(t *testing.T) {
20-
file, err := writeStartupScript("vm_name", "", vm.Zfs, false, false, "ubuntu")
20+
file, err := writeStartupScript("vm_name", "", vm.Zfs, false,
21+
false, "ubuntu", false)
2122
require.NoError(t, err)
2223

2324
f, err := os.ReadFile(file)

pkg/roachprod/vm/aws/testdata/startup_script

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ echo
77
function setup_disks() {
88

99

10+
11+
1012
use_multiple_disks=''
1113

1214
mount_prefix="/mnt/data"

pkg/roachprod/vm/azure/azure.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -931,6 +931,9 @@ func (p *Provider) createVM(
931931
startupArgs.AttachedDiskLun = &lun
932932
}
933933

934+
// Check if we only require a boot disk (workload only machines).
935+
startupArgs.BootDiskOnly = providerOpts.BootDiskOnly
936+
934937
startupScript, err := evalStartupTemplate(startupArgs)
935938
if err != nil {
936939
return machine, err
@@ -1058,7 +1061,7 @@ func (p *Provider) createVM(
10581061
machine.VirtualMachineProperties.StorageProfile.DiskControllerType = compute.NVMe
10591062
}
10601063

1061-
if !opts.SSDOpts.UseLocalSSD {
1064+
if !opts.SSDOpts.UseLocalSSD && !providerOpts.BootDiskOnly {
10621065
caching := compute.CachingTypesNone
10631066

10641067
switch providerOpts.DiskCaching {

pkg/roachprod/vm/azure/flags.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ type ProviderOpts struct {
2626
NetworkDiskSize int32
2727
UltraDiskIOPS int64
2828
DiskCaching string
29+
BootDiskOnly bool
2930
}
3031

3132
// These default locations support availability zones. At the time of
@@ -90,4 +91,6 @@ func (o *ProviderOpts) ConfigureCreateFlags(flags *pflag.FlagSet) {
9091
"Number of IOPS provisioned for ultra disk, only used if network-disk-type=ultra-disk")
9192
flags.StringVar(&o.DiskCaching, ProviderName+"-disk-caching", "none",
9293
"Disk caching behavior for attached storage. Valid values are: none, read-only, read-write. Not applicable to Ultra disks.")
94+
flags.BoolVar(&o.BootDiskOnly, ProviderName+"-boot-disk-only", o.BootDiskOnly,
95+
"Only attach the boot disk. No additional volumes will be provisioned even if specified.")
9396
}

pkg/roachprod/vm/azure/testdata/startup_script

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ echo
66
# Script for setting up a Azure machine for roachprod use.
77

88
function setup_disks() {
9-
mount_opts="defaults"
9+
10+
11+
mount_opts="defaults"
1012

1113
devices=()
1214

0 commit comments

Comments
 (0)