
Commit 802ecdc

craig[bot] and herkolategan committed
Merge #149795
149795: roachprod: limit max concurrent hosts in flight r=herkolategan a=herkolategan

Previously, creating a large number of hosts (> 200) would cause the command to fail, because the number of hosts in flight exceeded what `gcloud` can handle. Hosts were split and parallelized by zone, but if a large number of hosts were all in the same zone, the command would still fail.

This change adds two new constraints: the number of concurrent commands for VM creation, and the maximum number of hosts in flight at any time. The second limit prevents overloading the gcloud API.

Epic: None

Release note: None

Co-authored-by: Herko Lategan <[email protected]>
2 parents 5949c6b + 5e1d449 commit 802ecdc
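
The shape of the change, in brief: an errgroup capped at MaxConcurrentCommands bounds how many gcloud invocations run at once, while a counting semaphore capped at MaxConcurrentHosts bounds how many host names those in-flight invocations reference in total. Below is a minimal, self-contained sketch of that two-level throttle outside the roachprod codebase; the names runBatch and createAll and the host list are illustrative placeholders, and it assumes Go 1.21+ (for the min builtin) plus the golang.org/x/sync/errgroup and github.com/marusama/semaphore modules that the patch itself uses.

package main

import (
	"context"
	"fmt"

	"github.com/marusama/semaphore"
	"golang.org/x/sync/errgroup"
)

const (
	maxConcurrentCommands = 100 // cap on in-flight gcloud invocations
	maxConcurrentHosts    = 100 // cap on hosts referenced by in-flight invocations
)

// runBatch stands in for one gcloud command operating on a batch of hosts.
func runBatch(hosts []string) error {
	fmt.Printf("creating %d hosts: %v\n", len(hosts), hosts)
	return nil
}

// createAll mirrors the patch's structure: batch the hosts, run the batches on
// a size-limited errgroup, and weight a semaphore by batch size so the total
// number of hosts in flight never exceeds maxConcurrentHosts.
func createAll(ctx context.Context, hosts []string) error {
	g := &errgroup.Group{}
	g.SetLimit(maxConcurrentCommands)

	sem := semaphore.New(maxConcurrentHosts)
	batchSize := maxConcurrentHosts / 4

	for i := 0; i < len(hosts); i += batchSize {
		batch := hosts[i:min(i+batchSize, len(hosts))]
		g.Go(func() error {
			// Acquire one semaphore slot per host in the batch; this blocks
			// until enough hosts from earlier batches have finished.
			if err := sem.Acquire(ctx, len(batch)); err != nil {
				return err
			}
			defer sem.Release(len(batch))
			return runBatch(batch)
		})
	}
	return g.Wait()
}

func main() {
	hosts := make([]string, 0, 300)
	for i := 0; i < 300; i++ {
		hosts = append(hosts, fmt.Sprintf("host-%03d", i))
	}
	if err := createAll(context.Background(), hosts); err != nil {
		fmt.Println("error:", err)
	}
}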

File tree

2 files changed: +85, -47 lines


pkg/roachprod/vm/gce/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ go_library(
         "//pkg/util/syncutil",
         "//pkg/util/timeutil",
         "@com_github_cockroachdb_errors//:errors",
+        "@com_github_marusama_semaphore//:semaphore",
         "@com_github_masterminds_semver_v3//:semver",
         "@com_github_spf13_pflag//:pflag",
         "@org_golang_google_api//cloudbilling/v1beta",

pkg/roachprod/vm/gce/gcloud.go

Lines changed: 84 additions & 47 deletions
@@ -29,6 +29,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/cockroachdb/errors"
+	"github.com/marusama/semaphore"
 	"github.com/spf13/pflag"
 	"golang.org/x/exp/maps"
 	"golang.org/x/sync/errgroup"
@@ -44,7 +45,12 @@ const (
 	defaultImageProject = "ubuntu-os-cloud"
 	FIPSImageProject    = "ubuntu-os-pro-cloud"
 	ManagedLabel        = "managed"
-	MaxConcurrentVMOps  = 16
+
+	// These values limit concurrent `gcloud` CLI operations, and command
+	// length, to avoid overwhelming the API when managing large clusters. The
+	// limits were determined through empirical testing.
+	MaxConcurrentCommands = 100
+	MaxConcurrentHosts    = 100
 )
 
 type VolumeType string
@@ -1213,6 +1219,14 @@ func (p *Provider) ConfigureProviderFlags(flags *pflag.FlagSet, opt vm.MultipleP
 	)
 }
 
+// newLimitedErrorGroup creates an `errgroup.Group` with the cloud provider's
+// default limit on the number of concurrent operations.
+func newLimitedErrorGroup() *errgroup.Group {
+	g := &errgroup.Group{}
+	g.SetLimit(MaxConcurrentCommands)
+	return g
+}
+
 // useArmAMI returns true if the machine type is an arm64 machine type.
 func (o *ProviderOpts) useArmAMI() bool {
 	return strings.HasPrefix(strings.ToLower(o.MachineType), "t2a-")
@@ -1265,8 +1279,7 @@ func (p *Provider) editLabels(
 	tagArgsString := strings.Join(tagArgs, ",")
 	commonArgs := []string{"--project", p.GetProject(), fmt.Sprintf("--labels=%s", tagArgsString)}
 
-	var g errgroup.Group
-	g.SetLimit(MaxConcurrentVMOps)
+	g := newLimitedErrorGroup()
 	for _, v := range vms {
 		vmArgs := make([]string, len(cmdArgs))
 		copy(vmArgs, cmdArgs)
@@ -1770,30 +1783,41 @@ func (p *Provider) Create(
 		}
 
 	default:
-		var g errgroup.Group
+		g := newLimitedErrorGroup()
 		createArgs := []string{"compute", "instances", "create", "--subnet", "default", "--format", "json"}
 		createArgs = append(createArgs, "--labels", labels)
 		createArgs = append(createArgs, instanceArgs...)
 
+		sem := semaphore.New(MaxConcurrentHosts)
 		l.Printf("Creating %d instances, distributed across [%s]", len(names), strings.Join(usedZones, ", "))
 		for zone, zoneHosts := range zoneToHostNames {
-			argsWithZone := append(createArgs, "--zone", zone)
-			argsWithZone = append(argsWithZone, zoneHosts...)
-			g.Go(func() error {
-				var instances []jsonVM
-				err := runJSONCommand(argsWithZone, &instances)
-				if err != nil {
-					return errors.Wrapf(err, "Command: gcloud %s", argsWithZone)
-				}
-				vmListMutex.Lock()
-				defer vmListMutex.Unlock()
-				for _, i := range instances {
-					v := i.toVM(project, p.publicDomain)
-					vmList = append(vmList, *v)
-				}
-				return nil
-			})
+			groupSize := MaxConcurrentHosts / 4
+			for i := 0; i < len(zoneHosts); i += groupSize {
+				hostGroup := zoneHosts[i:min(i+groupSize, len(zoneHosts))]
+				argsWithZone := append(createArgs, "--zone", zone)
+				argsWithZone = append(argsWithZone, hostGroup...)
 
+				g.Go(func() error {
+					err := sem.Acquire(context.Background(), len(hostGroup))
+					if err != nil {
+						return errors.Wrapf(err, "Failed to acquire semaphore")
+					}
+					defer sem.Release(len(hostGroup))
+
+					var instances []jsonVM
+					err = runJSONCommand(argsWithZone, &instances)
+					if err != nil {
+						return errors.Wrapf(err, "Command: gcloud %s", argsWithZone)
+					}
+					vmListMutex.Lock()
+					defer vmListMutex.Unlock()
+					for _, i := range instances {
+						v := i.toVM(project, p.publicDomain)
+						vmList = append(vmList, *v)
+					}
+					return nil
+				})
+			}
 		}
 		err = g.Wait()
 		if err != nil {
@@ -1867,7 +1891,7 @@ func (p *Provider) Shrink(l *logger.Logger, vmsToDelete vm.List, clusterName str
 		vmZones[cVM.Zone] = append(vmZones[cVM.Zone], cVM)
 	}
 
-	g := errgroup.Group{}
+	g := newLimitedErrorGroup()
 	for zone, vms := range vmZones {
 		instances := vms.Names()
 		args := []string{"compute", "instance-groups", "managed", "delete-instances",
@@ -1911,7 +1935,7 @@ func (p *Provider) Grow(
 	zoneToHostNames := computeHostNamesPerZone(groups, names, newNodeCount)
 
 	addedVms := make(map[string]bool)
-	var g errgroup.Group
+	g := newLimitedErrorGroup()
 	for _, group := range groups {
 		createArgs := []string{"compute", "instance-groups", "managed", "create-instance", "--zone", group.Zone, groupName,
 			"--project", project}
@@ -2051,7 +2075,7 @@ func listHealthChecks(project string) ([]jsonHealthCheck, error) {
 // all of them. Health checks associated with the cluster are also deleted.
 func deleteLoadBalancerResources(project, clusterName, portFilter string) error {
 	// List all the components of the load balancer resources tied to the project.
-	var g errgroup.Group
+	g := newLimitedErrorGroup()
 	var services []jsonBackendService
 	var proxies []jsonTargetTCPProxy
 	var rules []jsonForwardingRule
@@ -2120,7 +2144,7 @@ func deleteLoadBalancerResources(project, clusterName, portFilter string) error
 
 	// Delete all the components of the load balancer. Resources must be deleted
 	// in the correct order to avoid dependency errors.
-	g = errgroup.Group{}
+	g = newLimitedErrorGroup()
 	for _, rule := range filteredForwardingRules {
 		args := []string{"compute", "forwarding-rules", "delete",
 			rule.Name,
@@ -2140,7 +2164,7 @@ func deleteLoadBalancerResources(project, clusterName, portFilter string) error
 	if err := g.Wait(); err != nil {
 		return err
 	}
-	g = errgroup.Group{}
+	g = newLimitedErrorGroup()
 	for _, proxy := range filteredProxies {
 		args := []string{"compute", "target-tcp-proxies", "delete",
 			proxy.Name,
@@ -2159,7 +2183,7 @@ func deleteLoadBalancerResources(project, clusterName, portFilter string) error
 	if err := g.Wait(); err != nil {
 		return err
 	}
-	g = errgroup.Group{}
+	g = newLimitedErrorGroup()
 	for _, service := range filteredServices {
 		args := []string{"compute", "backend-services", "delete",
 			service.Name,
@@ -2179,7 +2203,7 @@ func deleteLoadBalancerResources(project, clusterName, portFilter string) error
 	if err := g.Wait(); err != nil {
 		return err
 	}
-	g = errgroup.Group{}
+	g = newLimitedErrorGroup()
 	for _, healthCheck := range filteredHealthChecks {
 		args := []string{"compute", "health-checks", "delete",
 			healthCheck.Name,
@@ -2462,7 +2486,7 @@ func propagateDiskLabels(
 	useLocalSSD bool,
 	pdVolumeCount int,
 ) error {
-	var g errgroup.Group
+	g := newLimitedErrorGroup()
 
 	l.Printf("Propagating labels across all disks")
 	argsPrefix := []string{"compute", "disks", "update"}
@@ -2849,7 +2873,7 @@ func (p *Provider) deleteManaged(l *logger.Logger, vms vm.List) error {
 		clusterProjectMap[clusterName] = v.Project
 	}
 
-	var g errgroup.Group
+	g := newLimitedErrorGroup()
 	for cluster, project := range clusterProjectMap {
 		// Delete any load balancer resources associated with the cluster. Trying to
 		// delete the instance group before the load balancer resources will result
@@ -2885,7 +2909,7 @@ func (p *Provider) deleteManaged(l *logger.Logger, vms vm.List) error {
 
 	// All instance groups have to be deleted before the instance templates can be
 	// deleted.
-	g = errgroup.Group{}
+	g = newLimitedErrorGroup()
 	for cluster, project := range clusterProjectMap {
 		templates, err := listInstanceTemplates(project, cluster)
 		if err != nil {
@@ -2912,29 +2936,42 @@ func (p *Provider) deleteUnmanaged(l *logger.Logger, vms vm.List) error {
 		projectZoneMap[v.Project][v.Zone] = append(projectZoneMap[v.Project][v.Zone], v.Name)
 	}
 
-	var g errgroup.Group
+	g := newLimitedErrorGroup()
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
 	defer cancel()
+
+	sem := semaphore.New(MaxConcurrentHosts)
 	for project, zoneMap := range projectZoneMap {
 		for zone, names := range zoneMap {
-			args := []string{
-				"compute", "instances", "delete",
-				"--delete-disks", "all",
-			}
+			groupSize := MaxConcurrentHosts / 4
+			for i := 0; i < len(names); i += groupSize {
+				nameGroup := names[i:min(i+groupSize, len(names))]
 
-			args = append(args, "--project", project)
-			args = append(args, "--zone", zone)
-			args = append(args, names...)
+				args := []string{
+					"compute", "instances", "delete",
+					"--delete-disks", "all",
+				}
 
-			g.Go(func() error {
-				cmd := exec.CommandContext(ctx, "gcloud", args...)
+				args = append(args, "--project", project)
+				args = append(args, "--zone", zone)
+				args = append(args, nameGroup...)
 
-				output, err := cmd.CombinedOutput()
-				if err != nil {
-					return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output)
-				}
-				return nil
-			})
+				g.Go(func() error {
+					err := sem.Acquire(ctx, len(nameGroup))
+					if err != nil {
+						return errors.Wrapf(err, "Failed to acquire semaphore")
+					}
+					defer sem.Release(len(nameGroup))
+
+					cmd := exec.CommandContext(ctx, "gcloud", args...)
+
+					output, err := cmd.CombinedOutput()
+					if err != nil {
+						return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output)
+					}
+					return nil
+				})
+			}
 		}
 	}
 
@@ -2956,7 +2993,7 @@ func (p *Provider) Reset(l *logger.Logger, vms vm.List) error {
 		projectZoneMap[v.Project][v.Zone] = append(projectZoneMap[v.Project][v.Zone], v.Name)
 	}
 
-	var g errgroup.Group
+	g := newLimitedErrorGroup()
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
 	defer cancel()
 	for project, zoneMap := range projectZoneMap {
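
Both Create and deleteUnmanaged above repeat the same chunking loop, slicing a zone's host list into groups of at most MaxConcurrentHosts / 4 names so that no single gcloud command line carries an unbounded number of hosts (the new constants are described in the diff as limiting both concurrency and command length). A minimal sketch of just that slicing step follows; the chunk helper is hypothetical (the patch writes the loop inline at each call site) and it assumes Go 1.21+ for the min builtin.

package main

import "fmt"

// chunk splits items into consecutive groups of at most size elements.
// Hypothetical helper; the patch inlines this loop rather than extracting it.
func chunk(items []string, size int) [][]string {
	var groups [][]string
	for i := 0; i < len(items); i += size {
		groups = append(groups, items[i:min(i+size, len(items))])
	}
	return groups
}

func main() {
	hosts := []string{"h1", "h2", "h3", "h4", "h5", "h6", "h7"}
	fmt.Println(chunk(hosts, 3)) // [[h1 h2 h3] [h4 h5 h6] [h7]]
}

With MaxConcurrentHosts = 100 this yields batches of 25 hosts, so 300 hosts in one zone become 12 gcloud invocations instead of one oversized command, and the weighted semaphore lets at most four of those batches hold hosts at any moment.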
