diff --git a/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml b/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml new file mode 100644 index 0000000000..a57476baa6 --- /dev/null +++ b/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml @@ -0,0 +1,34 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: enhancement + +# Change summary; a 80ish characters long description of the change. +summary: Clear agent.upgrade_attempts when upgrade is complete. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +description: | + The new AutomaticAgentUpgradeTask Kibana task sets the upgrade_attempts property in agents it upgrades. + This property is used to track upgrade retries and should therefore be cleared when the upgrade is complete. + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: fleet-server + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/fleet-server/pull/4528 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +# issue: diff --git a/internal/pkg/api/handleCheckin.go b/internal/pkg/api/handleCheckin.go index 38cd78836a..6bbe9f877e 100644 --- a/internal/pkg/api/handleCheckin.go +++ b/internal/pkg/api/handleCheckin.go @@ -219,7 +219,7 @@ func (ct *CheckinT) validateRequest(zlog zerolog.Logger, w http.ResponseWriter, } wTime := pollDuration + time.Minute - rc := http.NewResponseController(w) //nolint:bodyclose // we are working with a ResponseWriter not a Respons + rc := http.NewResponseController(w) if err := rc.SetWriteDeadline(start.Add(wTime)); err != nil { zlog.Warn().Err(err).Time("write_deadline", start.Add(wTime)).Msg("Unable to set checkin write deadline.") } else { @@ -398,6 +398,26 @@ func (ct *CheckinT) ProcessRequest(zlog zerolog.Logger, w http.ResponseWriter, r return ct.writeResponse(zlog, w, r, agent, resp) } +func (ct *CheckinT) verifyActionExists(vCtx context.Context, vSpan *apm.Span, agent *model.Agent, details *UpgradeDetails) (*model.Action, error) { + action, ok := ct.cache.GetAction(details.ActionId) + if !ok { + actions, err := dl.FindAction(vCtx, ct.bulker, details.ActionId) + if err != nil { + vSpan.End() + return nil, fmt.Errorf("unable to find upgrade_details action: %w", err) + } + if len(actions) == 0 { + vSpan.End() + zerolog.Ctx(vCtx).Warn().Msgf("upgrade_details no action for id %q found (agent id %q)", details.ActionId, agent.Agent.ID) + return nil, nil + } + action = actions[0] + ct.cache.SetAction(action) + } + vSpan.End() + return &action, nil +} + // processUpgradeDetails will verify and set the upgrade_details section of an agent document based on checkin value. // if the agent doc and checkin details are both nil the method is a nop // if the checkin upgrade_details is nil but there was a previous value in the agent doc, fleet-server treats it as a successful upgrade @@ -412,24 +432,14 @@ func (ct *CheckinT) processUpgradeDetails(ctx context.Context, agent *model.Agen } // update docs with in progress details - // verify action exists vSpan, vCtx := apm.StartSpan(ctx, "Check update action", "validate") - action, ok := ct.cache.GetAction(details.ActionId) - if !ok { - actions, err := dl.FindAction(vCtx, ct.bulker, details.ActionId) - if err != nil { - vSpan.End() - return fmt.Errorf("unable to find upgrade_details action: %w", err) - } - if len(actions) == 0 { - vSpan.End() - zerolog.Ctx(vCtx).Warn().Msgf("upgrade_details no action for id %q found (agent id %q)", details.ActionId, agent.Agent.ID) - return nil - } - action = actions[0] - ct.cache.SetAction(action) + action, err := ct.verifyActionExists(ctx, vSpan, agent, details) + if err != nil { + return err + } + if action == nil { + return nil } - vSpan.End() // link action with APM spans var links []apm.SpanLink @@ -500,6 +510,10 @@ func (ct *CheckinT) processUpgradeDetails(ctx context.Context, agent *model.Agen doc := bulk.UpdateFields{ dl.FieldUpgradeDetails: details, } + if agent.UpgradeAttempts != nil && details.State == UpgradeDetailsStateUPGWATCHING { + doc[dl.FieldUpgradeAttempts] = nil + } + body, err := doc.Marshal() if err != nil { return err diff --git a/internal/pkg/api/handleCheckin_test.go b/internal/pkg/api/handleCheckin_test.go index cc5652f2da..da0e52d4fa 100644 --- a/internal/pkg/api/handleCheckin_test.go +++ b/internal/pkg/api/handleCheckin_test.go @@ -17,6 +17,7 @@ import ( "testing" "time" + "github.com/elastic/fleet-server/v7/internal/pkg/bulk" "github.com/elastic/fleet-server/v7/internal/pkg/cache" "github.com/elastic/fleet-server/v7/internal/pkg/checkin" "github.com/elastic/fleet-server/v7/internal/pkg/config" @@ -312,6 +313,17 @@ func TestResolveSeqNo(t *testing.T) { func TestProcessUpgradeDetails(t *testing.T) { esd := model.ESDocument{Id: "doc-ID"} + doc := bulk.UpdateFields{ + dl.FieldUpgradeDetails: &UpgradeDetails{ + ActionId: "test-action", + State: UpgradeDetailsStateUPGWATCHING, + }, + dl.FieldUpgradeAttempts: nil, + } + body, err := doc.Marshal() + if err != nil { + t.Fatalf("marshal error: %v", err) + } tests := []struct { name string agent *model.Agent @@ -625,6 +637,24 @@ func TestProcessUpgradeDetails(t *testing.T) { return mCache }, err: ErrInvalidUpgradeMetadata, + }, { + name: "clear upgrade attempts when watching", + agent: &model.Agent{ESDocument: esd, Agent: &model.AgentMetadata{ID: "test-agent"}, UpgradeAttempts: make([]string, 0)}, + details: &UpgradeDetails{ + ActionId: "test-action", + State: UpgradeDetailsStateUPGWATCHING, + }, + bulk: func() *ftesting.MockBulk { + mBulk := ftesting.NewMockBulk() + mBulk.On("Update", mock.Anything, dl.FleetAgents, "doc-ID", body, mock.Anything, mock.Anything).Return(nil) + return mBulk + }, + cache: func() *testcache.MockCache { + mCache := testcache.NewMockCache() + mCache.On("GetAction", "test-action").Return(model.Action{}, true) + return mCache + }, + err: nil, }} for _, tc := range tests { diff --git a/internal/pkg/dl/constants.go b/internal/pkg/dl/constants.go index 73d382789f..9c191955aa 100644 --- a/internal/pkg/dl/constants.go +++ b/internal/pkg/dl/constants.go @@ -54,6 +54,7 @@ const ( FieldUpgradeStartedAt = "upgrade_started_at" FieldUpgradeStatus = "upgrade_status" FieldUpgradeDetails = "upgrade_details" + FieldUpgradeAttempts = "upgrade_attempts" FieldAuditUnenrolledTime = "audit_unenrolled_time" FieldAuditUnenrolledReason = "audit_unenrolled_reason" diff --git a/internal/pkg/model/schema.go b/internal/pkg/model/schema.go index 7991bd3aac..11ffac6a94 100644 --- a/internal/pkg/model/schema.go +++ b/internal/pkg/model/schema.go @@ -216,6 +216,9 @@ type Agent struct { // Date/time the Elastic Agent was last updated UpdatedAt string `json:"updated_at,omitempty"` + // List of timestamps of attempts of Elastic Agent automatic upgrades + UpgradeAttempts []string `json:"upgrade_attempts,omitempty"` + // Additional upgrade status details. UpgradeDetails *UpgradeDetails `json:"upgrade_details,omitempty"` diff --git a/model/schema.json b/model/schema.json index 6fb03c87cb..8e19b7b7f9 100644 --- a/model/schema.json +++ b/model/schema.json @@ -686,6 +686,13 @@ "description": "Additional upgrade status details.", "type": "object" }, + "upgrade_attempts": { + "description": "List of timestamps of attempts of Elastic Agent automatic upgrades", + "type": "array", + "items": { + "type": "string" + } + }, "replace_token": { "description": "hash of token provided during enrollment that allows replacement by another enrollment with same ID", "type": "string"