Skip to content

Commit b201e16

Browse files
Retry enrollment requests when an error is returned, add enrollment timeout (elastic#8056)
Retry enrollment requests when an error is returned until a timeout is reached. Add --enroll-timeout and FLEET_ENROLL_TIMEOUT to control how long the timeout is; default 10m. A negative value disables the timeout.
1 parent 47c9f91 commit b201e16

File tree

7 files changed

+189
-67
lines changed

7 files changed

+189
-67
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: enhancement
12+
13+
# Change summary; a description of the change, roughly 80 characters long.
14+
summary: Retry enrollment requests on any error
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodates a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
description: |
20+
If any error is encountered during an attempted enrollment, the elastic-agent
21+
will back off and retry. Add a new --enroll-timeout flag and
22+
FLEET_ENROLL_TIMEOUT env var to set how long it tries for, default 10m. A
23+
negative value disables the timeout.
24+
25+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
26+
component: elastic-agent
27+
28+
# PR URL; optional; the PR number that added the changeset.
29+
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
30+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
31+
# Please provide it if you are adding a fragment for a different PR.
32+
pr: https://github.com/elastic/elastic-agent/pull/8056
33+
34+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
35+
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
36+
#issue: https://github.com/owner/repo/1234

internal/pkg/agent/cmd/container.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ The following actions are possible and grouped based on the actions.
8181
FLEET_ENROLL - set to 1 for enrollment into Fleet Server. If not set, Elastic Agent is run in standalone mode.
8282
FLEET_URL - URL of the Fleet Server to enroll into
8383
FLEET_ENROLLMENT_TOKEN - token to use for enrollment. This is not needed in case FLEET_SERVER_ENABLED and FLEET_ENROLL is set. Then the token is fetched from Kibana.
84+
FLEET_ENROLL_TIMEOUT - The timeout duration for the enroll command. Defaults to 10m. A negative value disables the timeout.
8485
FLEET_CA - path to certificate authority to use to communicate with Fleet Server [$KIBANA_CA]
8586
FLEET_INSECURE - communicate with Fleet with either insecure HTTP or unverified HTTPS
8687
ELASTIC_AGENT_CERT - path to certificate to use for connecting to fleet-server.
@@ -390,7 +391,7 @@ func ensureServiceToken(streams *cli.IOStreams, cfg *setupConfig) error {
390391
if err != nil {
391392
return err
392393
}
393-
code, r, err := client.Connection.Request("POST", "/api/fleet/service_tokens", nil, nil, nil)
394+
code, r, err := client.Request("POST", "/api/fleet/service_tokens", nil, nil, nil)
394395
if err != nil {
395396
return fmt.Errorf("request to get security token from Kibana failed: %w", err)
396397
}
@@ -517,6 +518,10 @@ func buildEnrollArgs(cfg setupConfig, token string, policyID string) ([]string,
517518
args = append(args, "--daemon-timeout")
518519
args = append(args, cfg.Fleet.DaemonTimeout.String())
519520
}
521+
if cfg.Fleet.EnrollTimeout != 0 {
522+
args = append(args, "--enroll-timeout")
523+
args = append(args, cfg.Fleet.EnrollTimeout.String())
524+
}
520525
if cfg.Fleet.Cert != "" {
521526
args = append(args, "--elastic-agent-cert", cfg.Fleet.Cert)
522527
}
@@ -693,14 +698,14 @@ func isTrue(val string) bool {
693698
func performGET(cfg setupConfig, client *kibana.Client, path string, response interface{}, writer io.Writer, msg string) error {
694699
var lastErr error
695700
for i := 0; i < cfg.Kibana.RetryMaxCount; i++ {
696-
code, result, err := client.Connection.Request("GET", path, nil, nil, nil)
701+
code, result, err := client.Request("GET", path, nil, nil, nil)
697702
if err != nil || code != 200 {
698703
if err != nil {
699704
err = fmt.Errorf("http GET request to %s%s fails: %w. Response: %s",
700-
client.Connection.URL, path, err, truncateString(result))
705+
client.URL, path, err, truncateString(result))
701706
} else {
702707
err = fmt.Errorf("http GET request to %s%s fails. StatusCode: %d Response: %s",
703-
client.Connection.URL, path, code, truncateString(result))
708+
client.URL, path, code, truncateString(result))
704709
}
705710
fmt.Fprintf(writer, "%s failed: %s\n", msg, err)
706711
<-time.After(cfg.Kibana.RetrySleepDuration)
@@ -721,7 +726,7 @@ func truncateString(b []byte) string {
721726
runes = append(runes[:maxLength], []rune("... (truncated)")...)
722727
}
723728

724-
return strings.Replace(string(runes), "\n", " ", -1)
729+
return strings.ReplaceAll(string(runes), "\n", " ")
725730
}
726731

727732
// runLegacyAPMServer extracts the bundled apm-server from elastic-agent

internal/pkg/agent/cmd/enroll.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"strconv"
1515
"strings"
1616
"syscall"
17+
"time"
1718

1819
"github.com/spf13/cobra"
1920

@@ -95,6 +96,7 @@ func addEnrollFlags(cmd *cobra.Command) {
9596
cmd.Flags().StringSliceP("proxy-header", "", []string{}, "Proxy headers used with CONNECT request: when bootstrapping Fleet Server, it's the proxy used by Fleet Server to connect to Elasticsearch; when enrolling the Elastic Agent to Fleet Server, it's the proxy used by the Elastic Agent to connect to Fleet Server")
9697
cmd.Flags().BoolP("delay-enroll", "", false, "Delays enrollment to occur on first start of the Elastic Agent service")
9798
cmd.Flags().DurationP("daemon-timeout", "", 0, "Timeout waiting for Elastic Agent daemon")
99+
cmd.Flags().DurationP("enroll-timeout", "", 10*time.Minute, "Timeout waiting for Elastic Agent enroll command. A negative value disables the timeout.")
98100
cmd.Flags().DurationP("fleet-server-timeout", "", 0, "When bootstrapping Fleet Server, timeout waiting for Fleet Server to be ready to start enrollment")
99101
cmd.Flags().Bool("skip-daemon-reload", false, "Skip daemon reload after enrolling")
100102
cmd.Flags().StringSliceP("tag", "", []string{}, "User-set tags")
@@ -205,6 +207,7 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string
205207
fProxyHeaders, _ := cmd.Flags().GetStringSlice("proxy-header")
206208
delayEnroll, _ := cmd.Flags().GetBool("delay-enroll")
207209
daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout")
210+
enrollTimeout, _ := cmd.Flags().GetDuration("enroll-timeout")
208211
fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout")
209212
skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload")
210213
fTags, _ := cmd.Flags().GetStringSlice("tag")
@@ -285,6 +288,10 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string
285288
args = append(args, "--daemon-timeout")
286289
args = append(args, daemonTimeout.String())
287290
}
291+
if enrollTimeout != 0 {
292+
args = append(args, "--enroll-timeout")
293+
args = append(args, enrollTimeout.String())
294+
}
288295
if fTimeout != 0 {
289296
args = append(args, "--fleet-server-timeout")
290297
args = append(args, fTimeout.String())
@@ -461,6 +468,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
461468
proxyHeaders, _ := cmd.Flags().GetStringSlice("proxy-header")
462469
delayEnroll, _ := cmd.Flags().GetBool("delay-enroll")
463470
daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout")
471+
enrollTimeout, _ := cmd.Flags().GetDuration("enroll-timeout")
464472
fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout")
465473
skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload")
466474
tags, _ := cmd.Flags().GetStringSlice("tag")
@@ -475,6 +483,12 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
475483

476484
ctx := handleSignal(context.Background())
477485

486+
if enrollTimeout > 0 {
487+
eCtx, cancel := context.WithTimeout(ctx, enrollTimeout)
488+
defer cancel()
489+
ctx = eCtx
490+
}
491+
478492
// On MacOS Ventura and above, fixing the permissions on enrollment during installation fails with the error:
479493
// Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
480494
// This is because we are fixing permissions twice, once during installation and again during the enrollment step.

internal/pkg/agent/cmd/enroll_cmd.go

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -543,25 +543,23 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
543543
defer close(signal)
544544
backExp := c.backoffFactory(signal)
545545

546+
RETRYLOOP:
546547
for {
547-
retry := false
548548
switch {
549549
case errors.Is(err, fleetapi.ErrTooManyRequests):
550550
c.log.Warn("Too many requests on the remote server, will retry in a moment.")
551-
retry = true
552551
case errors.Is(err, fleetapi.ErrConnRefused):
553552
c.log.Warn("Remote server is not ready to accept connections(Connection Refused), will retry in a moment.")
554-
retry = true
555553
case errors.Is(err, fleetapi.ErrTemporaryServerError):
556554
c.log.Warnf("Remote server failed to handle the request(%s), will retry in a moment.", err.Error())
557-
retry = true
555+
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded), err == nil:
556+
break RETRYLOOP
558557
case err != nil:
559-
c.log.Warnf("Enrollment failed: %s", err.Error())
558+
c.log.Warnf("Error detected: %s, will retry in a moment.", err.Error())
560559
}
561-
if !retry {
562-
break
560+
if !backExp.Wait() {
561+
break RETRYLOOP
563562
}
564-
backExp.Wait()
565563
c.log.Infof("Retrying enrollment to URL: %s", c.client.URI())
566564
err = c.enroll(ctx, persistentConfig)
567565
}
@@ -597,9 +595,7 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
597595

598596
resp, err := cmd.Execute(ctx, r)
599597
if err != nil {
600-
return errors.New(err,
601-
"fail to execute request to fleet-server",
602-
errors.TypeNetwork)
598+
return fmt.Errorf("failed to execute request to fleet-server: %w", err)
603599
}
604600

605601
fleetConfig, err := createFleetConfigFromEnroll(resp.Item.AccessAPIKey, c.options.EnrollAPIKey, c.options.ReplaceToken, c.remoteConfig)

0 commit comments

Comments
 (0)