Skip to content

Commit aa74064

Browse files
authored
Merge pull request kubernetes#75217 from deads2k/agg-parallel
for aggregated apiserver availability, try multiple endpoints in parallel
2 parents (a2c200a + 0f7185e), commit aa74064

File tree

1 file changed

+44
-21
lines changed

1 file changed

+44
-21
lines changed

staging/src/k8s.io/kube-aggregator/pkg/controllers/status/available_controller.go

Lines changed: 44 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -242,41 +242,64 @@ func (c *AvailableConditionController) sync(key string) error {
242242
}
243243
// actually try to hit the discovery endpoint when it isn't local and when we're routing as a service.
244244
if apiService.Spec.Service != nil && c.serviceResolver != nil {
245-
discoveryURL, err := c.serviceResolver.ResolveEndpoint(apiService.Spec.Service.Namespace, apiService.Spec.Service.Name, apiService.Spec.Service.Port)
246-
if err != nil {
247-
return err
248-
}
245+
attempts := 5
246+
results := make(chan error, attempts)
247+
for i := 0; i < attempts; i++ {
248+
go func() {
249+
discoveryURL, err := c.serviceResolver.ResolveEndpoint(apiService.Spec.Service.Namespace, apiService.Spec.Service.Name, apiService.Spec.Service.Port)
250+
if err != nil {
251+
results <- err
252+
return
253+
}
249254

250-
errCh := make(chan error)
251-
go func() {
252-
resp, err := c.discoveryClient.Get(discoveryURL.String())
253-
if resp != nil {
254-
resp.Body.Close()
255-
}
256-
errCh <- err
257-
}()
255+
errCh := make(chan error)
256+
go func() {
257+
resp, err := c.discoveryClient.Get(discoveryURL.String())
258+
if resp != nil {
259+
resp.Body.Close()
260+
}
261+
errCh <- err
262+
}()
263+
264+
select {
265+
case err = <-errCh:
266+
if err != nil {
267+
results <- fmt.Errorf("no response from %v: %v", discoveryURL, err)
268+
return
269+
}
270+
271+
// we had trouble with slow dial and DNS responses causing us to wait too long.
272+
// we added this as insurance
273+
case <-time.After(6 * time.Second):
274+
results <- fmt.Errorf("timed out waiting for %v", discoveryURL)
275+
return
276+
}
258277

259-
select {
260-
case err = <-errCh:
278+
results <- nil
279+
}()
280+
}
261281

262-
// we had trouble with slow dial and DNS responses causing us to wait too long.
263-
// we added this as insurance
264-
case <-time.After(6 * time.Second):
265-
err = fmt.Errorf("timed out waiting for %v", discoveryURL)
282+
var lastError error
283+
for i := 0; i < attempts; i++ {
284+
lastError = <-results
285+
// if we had at least one success, we are successful overall and we can return now
286+
if lastError == nil {
287+
break
288+
}
266289
}
267290

268-
if err != nil {
291+
if lastError != nil {
269292
availableCondition.Status = apiregistration.ConditionFalse
270293
availableCondition.Reason = "FailedDiscoveryCheck"
271-
availableCondition.Message = fmt.Sprintf("no response from %v: %v", discoveryURL, err)
294+
availableCondition.Message = lastError.Error()
272295
apiregistration.SetAPIServiceCondition(apiService, availableCondition)
273296
_, updateErr := updateAPIServiceStatus(c.apiServiceClient, originalAPIService, apiService)
274297
if updateErr != nil {
275298
return updateErr
276299
}
277300
// force a requeue to make it very obvious that this will be retried at some point in the future
278301
// along with other requeues done via service change, endpoint change, and resync
279-
return err
302+
return lastError
280303
}
281304
}
282305

0 commit comments

Comments
 (0)