@@ -52,9 +52,9 @@ const (
52
52
// IP address, a firewall rule, a target pool, and a forwarding rule. This
53
53
// function has to manage all of them.
54
54
//
55
- // Due to an interesting series of design decisions, this handles both creating
56
- // new load balancers and updating existing load balancers, recognizing when
57
- // each is needed .
55
+ // This function handles both creating new load balancers and updating existing load balancers,
56
+ // recognizing when each is needed.
57
+ // This approach is resilient to interruption, for example if we are interrupted part-way through creation.
58
58
func (g * Cloud ) ensureExternalLoadBalancer (clusterName string , clusterID string , apiService * v1.Service , existingFwdRule * compute.ForwardingRule , nodes []* v1.Node ) (* v1.LoadBalancerStatus , error ) {
59
59
// Process services with LoadBalancerClass "networking.gke.io/l4-regional-external-legacy" used for this controller.
60
60
// LoadBalancerClass can't be updated so we know this controller should process the NetLB.
@@ -253,11 +253,37 @@ func (g *Cloud) ensureExternalLoadBalancer(clusterName string, clusterID string,
253
253
}
254
254
}
255
255
256
- ipAddressToUse , isSafeToReleaseIP , err := g .ensureIPAddress (loadBalancerName , lbRefStr , requestedIP , fwdRuleIP , netTier )
257
- if err != nil {
258
- return nil , err
259
- }
256
+ // Make sure we know which IP address will be used and have properly reserved
257
+ // it as static before moving forward with the rest of our operations.
258
+ //
259
+ // We use static IP addresses when updating a load balancer to ensure that we
260
+ // can replace the load balancer's other components without changing the
261
+ // address its service is reachable on. We do it this way rather than always
262
+ // keeping the static IP around even though this is more complicated because
263
+ // it makes it less likely that we'll run into quota issues. Only 7 static
264
+ // IP addresses are allowed per region by default.
265
+ //
266
+ // We could let an IP be allocated for us when the forwarding rule is created,
267
+ // but we need the IP to set up the firewall rule, and we want to keep the
268
+ // forwarding rule creation as the last thing that needs to be done in this
269
+ // function in order to maintain the invariant that "if the forwarding rule
270
+ // exists, the LB has been fully created".
271
+ ipAddressToUse := ""
272
+
273
+ // Through this process we try to keep track of whether it is safe to
274
+ // release the IP that was allocated. If the user specifically asked for
275
+ // an IP, we assume they are managing it themselves. Otherwise, we will
276
+ // release the IP in case of early-terminating failure or upon successful
277
+ // creation of the LB.
278
+ // TODO(#36535): boil this logic down into a set of component functions
279
+ // and key the flag values off of errors returned.
280
+ isUserOwnedIP := false // if this is set, we never release the IP
281
+ isSafeToReleaseIP := false
282
+
260
283
defer func () {
284
+ if isUserOwnedIP {
285
+ return
286
+ }
261
287
if isSafeToReleaseIP {
262
288
if err := g .DeleteRegionAddress (loadBalancerName , g .region ); err != nil && ! isNotFound (err ) {
263
289
klog .Errorf ("ensureExternalLoadBalancer(%s): Failed to release static IP %s in region %v: %v." , lbRefStr , ipAddressToUse , g .region , err )
@@ -271,6 +297,36 @@ func (g *Cloud) ensureExternalLoadBalancer(clusterName string, clusterID string,
271
297
}
272
298
}()
273
299
300
+ if requestedIP != "" {
301
+ // If user requests a specific IP address, verify first. No mutation to
302
+ // the GCE resources will be performed in the verification process.
303
+ isUserOwnedIP , err = verifyUserRequestedIP (g , g .region , requestedIP , fwdRuleIP , lbRefStr , netTier )
304
+ if err != nil {
305
+ return nil , err
306
+ }
307
+ ipAddressToUse = requestedIP
308
+ }
309
+
310
+ if ! isUserOwnedIP {
311
+ // If we are not using the user-owned IP, either promote the
312
+ // ephemeral IP used by the fwd rule, or create a new static IP.
313
+ ipAddr , existed , err := ensureStaticIP (g , loadBalancerName , serviceName .String (), g .region , fwdRuleIP , netTier )
314
+ if err != nil {
315
+ return nil , fmt .Errorf ("failed to ensure a static IP for load balancer (%s): %v" , lbRefStr , err )
316
+ }
317
+ klog .Infof ("ensureExternalLoadBalancer(%s): Ensured IP address %s (tier: %s)." , lbRefStr , ipAddr , netTier )
318
+ // If the IP was not owned by the user, but it already existed, it
319
+ // could indicate that the previous update cycle failed. We can use
320
+ // this IP and try to run through the process again, but we should
321
+ // not release the IP unless it is explicitly flagged as OK.
322
+ isSafeToReleaseIP = ! existed
323
+ ipAddressToUse = ipAddr
324
+ }
325
+
326
+ // Deal with the firewall next. The reason we do this here rather than last
327
+ // is because the forwarding rule is used as the indicator that the load
328
+ // balancer is fully created - it's what getLoadBalancer checks for.
329
+ // Check if user specified the allow source range
274
330
sourceRanges , err := servicehelpers .GetLoadBalancerSourceRanges (apiService )
275
331
if err != nil {
276
332
return nil , err
@@ -308,7 +364,8 @@ func (g *Cloud) ensureExternalLoadBalancer(clusterName string, clusterID string,
308
364
klog .Infof ("ensureExternalLoadBalancer(%s): Target pool for service doesn't exist." , lbRefStr )
309
365
}
310
366
311
- // Health check logic...
367
+ // Check which health check needs to be created and which health check needs to be deleted.
368
+ // Health check management is coupled with target pool operation to prevent leaking.
312
369
var hcToCreate , hcToDelete * compute.HttpHealthCheck
313
370
hcLocalTrafficExisting , err := g .GetHTTPHealthCheck (loadBalancerName )
314
371
if err != nil && ! isHTTPErrorCode (err , http .StatusNotFound ) {
@@ -317,6 +374,9 @@ func (g *Cloud) ensureExternalLoadBalancer(clusterName string, clusterID string,
317
374
if path , healthCheckNodePort := servicehelpers .GetServiceHealthCheckPathPort (apiService ); path != "" {
318
375
klog .V (4 ).Infof ("ensureExternalLoadBalancer(%s): Service needs local traffic health checks on: %d%s." , lbRefStr , healthCheckNodePort , path )
319
376
if hcLocalTrafficExisting == nil {
377
+ // This logic exists to detect a transition from a non-OnlyLocal to an OnlyLocal service:
378
+ // turn on the tpNeedsRecreation flag to delete/recreate the fwdrule/tpool, updating the
379
+ // target pool to use the local traffic health check.
320
380
klog .V (2 ).Infof ("ensureExternalLoadBalancer(%s): Updating from nodes health checks to local traffic health checks." , lbRefStr )
321
381
hcToDelete = makeHTTPHealthCheck (MakeNodesHealthCheckName (clusterID ), GetNodesHealthCheckPath (), GetNodesHealthCheckPort ())
322
382
tpNeedsRecreation = true
@@ -534,8 +594,13 @@ func (g *Cloud) ensureExternalLoadBalancerDeleted(clusterName, clusterID string,
534
594
},
535
595
func () error {
536
596
klog .Infof ("ensureExternalLoadBalancerDeleted(%s): Deleting forwarding rules." , lbRefStr )
537
- // The forwarding rule must be deleted before either the target pool can,
538
- // unfortunately, so we have to do these two serially.
597
+ // The forwarding rule must be deleted before the target pool can be deleted,
598
+ // unfortunately, so we have to delete forwarding rules then target pools serially.
599
+ if err := ignoreNotFound (g .DeleteRegionForwardingRule (loadBalancerName , g .region )); err != nil {
600
+ return err
601
+ }
602
+
603
+ // TODO: Always or just with alpha feature flag?
539
604
frs , err := g .ListRegionForwardingRules (g .region )
540
605
if err != nil {
541
606
return err
@@ -1125,8 +1190,7 @@ func equalPorts(existingPorts, newPorts []string, existingPortRange, newPortRang
1125
1190
1126
1191
func groupPortsByProtocol (ports []v1.ServicePort ) map [v1.Protocol ][]v1.ServicePort {
1127
1192
grouped := make (map [v1.Protocol ][]v1.ServicePort )
1128
- for _ , p := range ports {
1129
- port := p
1193
+ for _ , port := range ports {
1130
1194
grouped [port .Protocol ] = append (grouped [port .Protocol ], port )
1131
1195
}
1132
1196
return grouped
@@ -1195,7 +1259,6 @@ func (g *Cloud) firewallNeedsUpdate(name, serviceName, ipAddress string, ports [
1195
1259
return true , false , nil
1196
1260
}
1197
1261
1198
-
1199
1262
func (g * Cloud ) ensureHTTPHealthCheckFirewall (svc * v1.Service , serviceName , ipAddress , region , clusterID string , hosts []* gceInstance , hcName string , hcPort int32 , isNodesHealthCheck bool ) error {
1200
1263
// Prepare the firewall params for creating / checking.
1201
1264
desc := fmt .Sprintf (`{"kubernetes.io/cluster-id":"%s"}` , clusterID )
@@ -1310,6 +1373,8 @@ func (g *Cloud) updateFirewall(svc *v1.Service, name, desc, destinationIP string
1310
1373
func (g * Cloud ) firewallObject (name , desc , destinationIP string , sourceRanges utilnet.IPNetSet , ports []v1.ServicePort , hosts []* gceInstance ) (* compute.Firewall , error ) {
1311
1374
// destinationIP can be empty string "" and this means that it is not set.
1312
1375
// GCE considers empty destinationRanges as "all" for ingress firewall-rules.
1376
+ // Concatenate service ports into port ranges. This helps to work around the GCE firewall limitation where only
1377
+ // 100 ports or port ranges can be used in a firewall rule.
1313
1378
groupedPorts := groupPortsByProtocol (ports )
1314
1379
var allowed []* compute.FirewallAllowed
1315
1380
for protocol , protocolPorts := range groupedPorts {
0 commit comments