Skip to content

Commit 8fbe8d2

Browse files
authored
Merge pull request #617 from kisieland/add-exp-backoff
Add Exponential Backoff on the ListReferrers GCE API calls.
2 parents b55897d + b695996 commit 8fbe8d2

File tree

4 files changed

+284
-107
lines changed

4 files changed

+284
-107
lines changed

cmd/gcp-controller-manager/loops.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,19 @@ import (
3636
)
3737

3838
type controllerContext struct {
39-
client clientset.Interface
40-
sharedInformers informers.SharedInformerFactory
41-
recorder record.EventRecorder
42-
gcpCfg gcpConfig
43-
clusterSigningGKEKubeconfig string
44-
csrApproverVerifyClusterMembership bool
45-
csrApproverAllowLegacyKubelet bool
46-
csrApproverUseGCEInstanceListReferrers bool
47-
authAuthorizeServiceAccountMappingURL string
48-
authSyncNodeURL string
49-
hmsAuthorizeSAMappingURL string
50-
hmsSyncNodeURL string
51-
clearStalePodsOnNodeRegistration bool
39+
client clientset.Interface
40+
sharedInformers informers.SharedInformerFactory
41+
recorder record.EventRecorder
42+
gcpCfg gcpConfig
43+
clusterSigningGKEKubeconfig string
44+
csrApproverVerifyClusterMembership bool
45+
csrApproverAllowLegacyKubelet bool
46+
csrApproverListReferrersConfig gceInstanceListReferrersConfig
47+
authAuthorizeServiceAccountMappingURL string
48+
authSyncNodeURL string
49+
hmsAuthorizeSAMappingURL string
50+
hmsSyncNodeURL string
51+
clearStalePodsOnNodeRegistration bool
5252
}
5353

5454
// loops returns all the control loops that the GCPControllerManager can start.

cmd/gcp-controller-manager/main.go

Lines changed: 71 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -62,24 +62,26 @@ const (
6262
)
6363

6464
var (
65-
port = pflag.Int("port", 8089, "Port to serve status endpoints on (such as /healthz and /metrics).")
66-
metricsPort = pflag.Int("metrics-port", 8089, "Deprecated. Port to expose Prometheus metrics on. If not set, uses the value of --port.")
67-
kubeconfig = pflag.String("kubeconfig", "", "Path to kubeconfig file with authorization and master location information.")
68-
clusterSigningGKEKubeconfig = pflag.String("cluster-signing-gke-kubeconfig", "", "If set, use the kubeconfig file to call GKE to sign cluster-scoped certificates instead of using a local private key.")
69-
gceConfigPath = pflag.String("gce-config", "/etc/gce.conf", "Path to gce.conf.")
70-
controllers = pflag.StringSlice("controllers", []string{"*"}, "Controllers to enable. Possible controllers are: "+strings.Join(loopNames(), ",")+".")
71-
csrApproverVerifyClusterMembership = pflag.Bool("csr-validate-cluster-membership", true, "Validate that VMs requesting CSRs belong to current GKE cluster.")
72-
csrApproverAllowLegacyKubelet = pflag.Bool("csr-allow-legacy-kubelet", true, "Allow legacy kubelet bootstrap flow.")
73-
csrApproverUseGCEInstanceListReferrers = pflag.Bool("csr-use-gce-instance-list-referrers", false, "If true use https://cloud.google.com/compute/docs/reference/rest/v1/instances/listReferrers to validate instance cluster membership.")
74-
gceAPIEndpointOverride = pflag.String("gce-api-endpoint-override", "", "If set, talks to a different GCE API Endpoint. By default it talks to https://www.googleapis.com/compute/v1/projects/")
75-
directPath = pflag.Bool("direct-path", false, "Enable Direct Path.")
76-
authAuthorizeServiceAccountMappingURL = pflag.String("auth-authorize-service-account-mapping-url", "", "URL for reaching the Auth Service AuthorizeServiceAccountMapping API.")
77-
authSyncNodeURL = pflag.String("auth-sync-node-url", "", "URL for reaching the Auth Service SyncNode API.")
78-
hmsAuthorizeSAMappingURL = pflag.String("hms-authorize-sa-mapping-url", "", "URL for reaching the Hosted Master Service AuthorizeSAMapping API.")
79-
hmsSyncNodeURL = pflag.String("hms-sync-node-url", "", "URL for reaching the Hosted Master Service SyncNode API.")
80-
kubeletReadOnlyCSRApprover = pflag.Bool("kubelet-read-only-csr-approver", false, "Enable kubelet readonly csr approver or not")
81-
autopilotEnabled = pflag.Bool("autopilot", false, "Is this a GKE Autopilot cluster.")
82-
clearStalePodsOnNodeRegistration = pflag.Bool("clearStalePodsOnNodeRegistration", false, "If true, after node registration, delete pods bound to old node.")
65+
port = pflag.Int("port", 8089, "Port to serve status endpoints on (such as /healthz and /metrics).")
66+
metricsPort = pflag.Int("metrics-port", 8089, "Deprecated. Port to expose Prometheus metrics on. If not set, uses the value of --port.")
67+
kubeconfig = pflag.String("kubeconfig", "", "Path to kubeconfig file with authorization and master location information.")
68+
clusterSigningGKEKubeconfig = pflag.String("cluster-signing-gke-kubeconfig", "", "If set, use the kubeconfig file to call GKE to sign cluster-scoped certificates instead of using a local private key.")
69+
gceConfigPath = pflag.String("gce-config", "/etc/gce.conf", "Path to gce.conf.")
70+
controllers = pflag.StringSlice("controllers", []string{"*"}, "Controllers to enable. Possible controllers are: "+strings.Join(loopNames(), ",")+".")
71+
csrApproverVerifyClusterMembership = pflag.Bool("csr-validate-cluster-membership", true, "Validate that VMs requesting CSRs belong to current GKE cluster.")
72+
csrApproverAllowLegacyKubelet = pflag.Bool("csr-allow-legacy-kubelet", true, "Allow legacy kubelet bootstrap flow.")
73+
csrApproverUseGCEInstanceListReferrers = pflag.Bool("csr-use-gce-instance-list-referrers", false, "If true use https://cloud.google.com/compute/docs/reference/rest/v1/instances/listReferrers to validate instance cluster membership.")
74+
csrApproverListReferrersInitialInterval = pflag.Duration("csr-gce-list-referrers-initial-interval", 5*time.Second, "Initial interval of the exponential back-off retries for calls to listReferrers, exponential factor is set to 1.5, defaults to 5s.")
75+
csrApproverListReferrersRetryCount = pflag.Int("csr-gce-list-referrers-retry-count", 10, "Maximal number of retries in exponential back-off for calls to listReferrers, defaults to 10")
76+
gceAPIEndpointOverride = pflag.String("gce-api-endpoint-override", "", "If set, talks to a different GCE API Endpoint. By default it talks to https://www.googleapis.com/compute/v1/projects/")
77+
directPath = pflag.Bool("direct-path", false, "Enable Direct Path.")
78+
authAuthorizeServiceAccountMappingURL = pflag.String("auth-authorize-service-account-mapping-url", "", "URL for reaching the Auth Service AuthorizeServiceAccountMapping API.")
79+
authSyncNodeURL = pflag.String("auth-sync-node-url", "", "URL for reaching the Auth Service SyncNode API.")
80+
hmsAuthorizeSAMappingURL = pflag.String("hms-authorize-sa-mapping-url", "", "URL for reaching the Hosted Master Service AuthorizeSAMapping API.")
81+
hmsSyncNodeURL = pflag.String("hms-sync-node-url", "", "URL for reaching the Hosted Master Service SyncNode API.")
82+
kubeletReadOnlyCSRApprover = pflag.Bool("kubelet-read-only-csr-approver", false, "Enable kubelet readonly csr approver or not")
83+
autopilotEnabled = pflag.Bool("autopilot", false, "Is this a GKE Autopilot cluster.")
84+
clearStalePodsOnNodeRegistration = pflag.Bool("clearStalePodsOnNodeRegistration", false, "If true, after node registration, delete pods bound to old node.")
8385
)
8486

8587
func main() {
@@ -104,22 +106,26 @@ func main() {
104106
logs.InitLogs()
105107

106108
s := &controllerManager{
107-
clusterSigningGKEKubeconfig: *clusterSigningGKEKubeconfig,
108-
gceConfigPath: *gceConfigPath,
109-
gceAPIEndpointOverride: *gceAPIEndpointOverride,
110-
controllers: *controllers,
111-
csrApproverVerifyClusterMembership: *csrApproverVerifyClusterMembership,
112-
csrApproverAllowLegacyKubelet: *csrApproverAllowLegacyKubelet,
113-
csrApproverUseGCEInstanceListReferrers: *csrApproverUseGCEInstanceListReferrers,
114-
leaderElectionConfig: *leConfig,
115-
authAuthorizeServiceAccountMappingURL: *authAuthorizeServiceAccountMappingURL,
116-
authSyncNodeURL: *authSyncNodeURL,
117-
hmsAuthorizeSAMappingURL: *hmsAuthorizeSAMappingURL,
118-
hmsSyncNodeURL: *hmsSyncNodeURL,
119-
healthz: healthz.NewHandler(),
120-
kubeletReadOnlyCSRApprover: *kubeletReadOnlyCSRApprover,
121-
autopilotEnabled: *autopilotEnabled,
122-
clearStalePodsOnNodeRegistration: *clearStalePodsOnNodeRegistration,
109+
clusterSigningGKEKubeconfig: *clusterSigningGKEKubeconfig,
110+
gceConfigPath: *gceConfigPath,
111+
gceAPIEndpointOverride: *gceAPIEndpointOverride,
112+
controllers: *controllers,
113+
csrApproverVerifyClusterMembership: *csrApproverVerifyClusterMembership,
114+
csrApproverAllowLegacyKubelet: *csrApproverAllowLegacyKubelet,
115+
csrApproverListReferrersConfig: gceInstanceListReferrersConfig{
116+
enabled: *csrApproverUseGCEInstanceListReferrers,
117+
initialInterval: *csrApproverListReferrersInitialInterval,
118+
retryCount: *csrApproverListReferrersRetryCount,
119+
},
120+
leaderElectionConfig: *leConfig,
121+
authAuthorizeServiceAccountMappingURL: *authAuthorizeServiceAccountMappingURL,
122+
authSyncNodeURL: *authSyncNodeURL,
123+
hmsAuthorizeSAMappingURL: *hmsAuthorizeSAMappingURL,
124+
hmsSyncNodeURL: *hmsSyncNodeURL,
125+
healthz: healthz.NewHandler(),
126+
kubeletReadOnlyCSRApprover: *kubeletReadOnlyCSRApprover,
127+
autopilotEnabled: *autopilotEnabled,
128+
clearStalePodsOnNodeRegistration: *clearStalePodsOnNodeRegistration,
123129
}
124130
var err error
125131
s.informerKubeconfig, err = clientcmd.BuildConfigFromFlags("", *kubeconfig)
@@ -166,20 +172,20 @@ func main() {
166172
// controllerManager is the main context object for the package.
167173
type controllerManager struct {
168174
// Fields initialized from flags.
169-
clusterSigningGKEKubeconfig string
170-
gceConfigPath string
171-
gceAPIEndpointOverride string
172-
controllers []string
173-
csrApproverVerifyClusterMembership bool
174-
csrApproverAllowLegacyKubelet bool
175-
csrApproverUseGCEInstanceListReferrers bool
176-
leaderElectionConfig componentbaseconfig.LeaderElectionConfiguration
177-
authAuthorizeServiceAccountMappingURL string
178-
authSyncNodeURL string
179-
hmsAuthorizeSAMappingURL string
180-
hmsSyncNodeURL string
181-
autopilotEnabled bool
182-
clearStalePodsOnNodeRegistration bool
175+
clusterSigningGKEKubeconfig string
176+
gceConfigPath string
177+
gceAPIEndpointOverride string
178+
controllers []string
179+
csrApproverVerifyClusterMembership bool
180+
csrApproverAllowLegacyKubelet bool
181+
csrApproverListReferrersConfig gceInstanceListReferrersConfig
182+
leaderElectionConfig componentbaseconfig.LeaderElectionConfiguration
183+
authAuthorizeServiceAccountMappingURL string
184+
authSyncNodeURL string
185+
hmsAuthorizeSAMappingURL string
186+
hmsSyncNodeURL string
187+
autopilotEnabled bool
188+
clearStalePodsOnNodeRegistration bool
183189

184190
// Kubelet Readonly CSR Approver
185191
kubeletReadOnlyCSRApprover bool
@@ -191,6 +197,13 @@ type controllerManager struct {
191197
healthz *healthz.Handler
192198
}
193199

200+
// gceInstanceListReferrersConfig holds the settings for verifying instance
// cluster membership through the GCE ListReferrers API, including the
// parameters of the exponential back-off used when retrying that call.
type gceInstanceListReferrersConfig struct {
	// enabled turns the ListReferrers-based membership check on; it is set
	// from the --csr-use-gce-instance-list-referrers flag.
	enabled bool
	// initialInterval is the starting interval of the exponential back-off;
	// it is set from the --csr-gce-list-referrers-initial-interval flag.
	initialInterval time.Duration
	// retryCount bounds the back-off (it is passed as wait.Backoff.Steps);
	// it is set from the --csr-gce-list-referrers-retry-count flag.
	retryCount int
}
206+
194207
func (s *controllerManager) isEnabled(name string) bool {
195208
var star bool
196209
for _, controller := range s.controllers {
@@ -240,16 +253,16 @@ func run(s *controllerManager) error {
240253
recorder: eventBroadcaster.NewRecorder(legacyscheme.Scheme, v1.EventSource{
241254
Component: name,
242255
}),
243-
gcpCfg: s.gcpConfig,
244-
clusterSigningGKEKubeconfig: s.clusterSigningGKEKubeconfig,
245-
csrApproverVerifyClusterMembership: s.csrApproverVerifyClusterMembership,
246-
csrApproverAllowLegacyKubelet: s.csrApproverAllowLegacyKubelet,
247-
csrApproverUseGCEInstanceListReferrers: s.csrApproverUseGCEInstanceListReferrers,
248-
authAuthorizeServiceAccountMappingURL: s.authAuthorizeServiceAccountMappingURL,
249-
authSyncNodeURL: s.authSyncNodeURL,
250-
hmsAuthorizeSAMappingURL: s.hmsAuthorizeSAMappingURL,
251-
hmsSyncNodeURL: s.hmsSyncNodeURL,
252-
clearStalePodsOnNodeRegistration: s.clearStalePodsOnNodeRegistration,
256+
gcpCfg: s.gcpConfig,
257+
clusterSigningGKEKubeconfig: s.clusterSigningGKEKubeconfig,
258+
csrApproverVerifyClusterMembership: s.csrApproverVerifyClusterMembership,
259+
csrApproverAllowLegacyKubelet: s.csrApproverAllowLegacyKubelet,
260+
csrApproverListReferrersConfig: s.csrApproverListReferrersConfig,
261+
authAuthorizeServiceAccountMappingURL: s.authAuthorizeServiceAccountMappingURL,
262+
authSyncNodeURL: s.authSyncNodeURL,
263+
hmsAuthorizeSAMappingURL: s.hmsAuthorizeSAMappingURL,
264+
hmsSyncNodeURL: s.hmsSyncNodeURL,
265+
clearStalePodsOnNodeRegistration: s.clearStalePodsOnNodeRegistration,
253266
}); err != nil {
254267
klog.Fatalf("Failed to start %q: %v", name, err)
255268
}

cmd/gcp-controller-manager/node_csr_approver.go

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
compute "google.golang.org/api/compute/v1"
3838
container "google.golang.org/api/container/v1"
3939
"google.golang.org/api/googleapi"
40+
"k8s.io/apimachinery/pkg/util/wait"
4041

4142
authorization "k8s.io/api/authorization/v1"
4243
capi "k8s.io/api/certificates/v1"
@@ -46,6 +47,7 @@ import (
4647
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
4748
utilerrors "k8s.io/apimachinery/pkg/util/errors"
4849
utilfeature "k8s.io/apiserver/pkg/util/feature"
50+
"k8s.io/apiserver/pkg/util/webhook"
4951
corev1apply "k8s.io/client-go/applyconfigurations/core/v1"
5052
"k8s.io/cloud-provider-gcp/pkg/csrmetrics"
5153
"k8s.io/cloud-provider-gcp/pkg/nodeidentity"
@@ -695,6 +697,43 @@ func validateInstanceGroupHint(instanceGroupUrls []string, instanceGroupHint str
695697
return resolved, nil
696698
}
697699

700+
var errNotFoundListReferrers = errors.New("not found the entry in ListReferrers")
701+
702+
func checkInstanceReferrersBackOff(ctx *controllerContext, instance *compute.Instance, clusterInstanceGroupUrls []string) bool {
703+
if !ctx.csrApproverListReferrersConfig.enabled {
704+
return false
705+
}
706+
klog.Infof("Using compute.InstancesService.ListReferrers to verify cluster membership of instance %q", instance.Name)
707+
708+
var found bool
709+
startTime := time.Now()
710+
backoffPolicy := wait.Backoff{
711+
Duration: ctx.csrApproverListReferrersConfig.initialInterval,
712+
Factor: 1.5,
713+
Jitter: 0.2,
714+
Steps: ctx.csrApproverListReferrersConfig.retryCount,
715+
}
716+
webhook.WithExponentialBackoff(context.TODO(), backoffPolicy, func() error {
717+
var retryErr error
718+
found, retryErr = checkInstanceReferrers(ctx, instance, clusterInstanceGroupUrls)
719+
if retryErr != nil || !found {
720+
return errNotFoundListReferrers
721+
}
722+
return nil
723+
},
724+
func(err error) bool {
725+
return err != nil
726+
},
727+
)
728+
729+
if found {
730+
klog.V(2).Infof("Determined cluster membership of instance %q using compute.InstancesService.ListReferrers after %v", instance.Name, time.Since(startTime))
731+
} else {
732+
klog.Warningf("Could not determine cluster membership of instance %q using compute.InstancesService.ListReferrers after %v; falling back to checking all instance groups", instance.Name, time.Since(startTime))
733+
}
734+
return found
735+
}
736+
698737
func checkInstanceReferrers(ctx *controllerContext, instance *compute.Instance, clusterInstanceGroupUrls []string) (bool, error) {
699738
// instanceZone looks like
700739
// "https://www.googleapis.com/compute/v1/projects/my-project/zones/us-central1-c"
@@ -743,16 +782,9 @@ func clusterHasInstance(ctx *controllerContext, instance *compute.Instance, inst
743782
return false, err
744783
}
745784

746-
if ctx.csrApproverUseGCEInstanceListReferrers {
747-
klog.Infof("using compute.InstancesService.ListReferrers to verify cluster membership of instance %q", instance.Name)
748-
ok, err := checkInstanceReferrers(ctx, instance, clusterInstanceGroupUrls)
749-
if err != nil {
750-
return false, err
751-
}
752-
if ok {
753-
return true, nil
754-
}
755-
klog.Warningf("could not determine cluster membership of instance %q using compute.InstancesService.ListReferrers; falling back to checking all instance groups", instance.Name)
785+
ok := checkInstanceReferrersBackOff(ctx, instance, clusterInstanceGroupUrls)
786+
if ok {
787+
return true, nil
756788
}
757789

758790
validatedInstanceGroupHint, err := validateInstanceGroupHint(clusterInstanceGroupUrls, instanceGroupHint)

0 commit comments

Comments
 (0)