From 1846685954ae2374e956f9625d9f9b3d39473dd6 Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Fri, 28 Feb 2025 12:45:45 +0100 Subject: [PATCH 01/14] Clear agent.upgrade_attemps on upgrade complete --- internal/pkg/api/handleCheckin.go | 1 + internal/pkg/dl/constants.go | 1 + 2 files changed, 2 insertions(+) diff --git a/internal/pkg/api/handleCheckin.go b/internal/pkg/api/handleCheckin.go index 38cd78836a..5f91515e42 100644 --- a/internal/pkg/api/handleCheckin.go +++ b/internal/pkg/api/handleCheckin.go @@ -521,6 +521,7 @@ func (ct *CheckinT) markUpgradeComplete(ctx context.Context, agent *model.Agent) dl.FieldUpgradeStartedAt: nil, dl.FieldUpgradeStatus: nil, dl.FieldUpgradedAt: time.Now().UTC().Format(time.RFC3339), + dl.FieldUpgradeAttempts: nil, } body, err := doc.Marshal() if err != nil { diff --git a/internal/pkg/dl/constants.go b/internal/pkg/dl/constants.go index 73d382789f..9c191955aa 100644 --- a/internal/pkg/dl/constants.go +++ b/internal/pkg/dl/constants.go @@ -54,6 +54,7 @@ const ( FieldUpgradeStartedAt = "upgrade_started_at" FieldUpgradeStatus = "upgrade_status" FieldUpgradeDetails = "upgrade_details" + FieldUpgradeAttempts = "upgrade_attempts" FieldAuditUnenrolledTime = "audit_unenrolled_time" FieldAuditUnenrolledReason = "audit_unenrolled_reason" From 0bc402dcb8c41af44c27e4d9e6e38cab32919677 Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Fri, 28 Feb 2025 16:28:27 +0100 Subject: [PATCH 02/14] This actually works --- internal/pkg/api/handleAck.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/pkg/api/handleAck.go b/internal/pkg/api/handleAck.go index 0e872c40f1..2e947ae6e9 100644 --- a/internal/pkg/api/handleAck.go +++ b/internal/pkg/api/handleAck.go @@ -619,6 +619,7 @@ func (ack *AckT) handleUpgrade(ctx context.Context, zlog zerolog.Logger, agent * dl.FieldUpgradeStartedAt: nil, dl.FieldUpgradeStatus: nil, dl.FieldUpgradedAt: now, + dl.FieldUpgradeAttempts: nil, } } From 18937753e1a887d0a7f58164c2daa6cff0a4136c Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Fri, 28 Feb 2025 16:49:14 +0100 Subject: [PATCH 03/14] Silence nolintlint error in handleCheckin.go --- internal/pkg/api/handleCheckin.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/api/handleCheckin.go b/internal/pkg/api/handleCheckin.go index 5f91515e42..bfa2666ecc 100644 --- a/internal/pkg/api/handleCheckin.go +++ b/internal/pkg/api/handleCheckin.go @@ -219,7 +219,7 @@ func (ct *CheckinT) validateRequest(zlog zerolog.Logger, w http.ResponseWriter, } wTime := pollDuration + time.Minute - rc := http.NewResponseController(w) //nolint:bodyclose // we are working with a ResponseWriter not a Respons + rc := http.NewResponseController(w) //nolint:nolintlint,bodyclose // we are working with a ResponseWriter not a Respons if err := rc.SetWriteDeadline(start.Add(wTime)); err != nil { zlog.Warn().Err(err).Time("write_deadline", start.Add(wTime)).Msg("Unable to set checkin write deadline.") } else { From d196d606c0ceea10d86ced392eb38606bd63123b Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Fri, 28 Feb 2025 16:55:04 +0100 Subject: [PATCH 04/14] Remove nolint comment altogether --- internal/pkg/api/handleCheckin.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/api/handleCheckin.go b/internal/pkg/api/handleCheckin.go index bfa2666ecc..672663a6f9 100644 --- a/internal/pkg/api/handleCheckin.go +++ b/internal/pkg/api/handleCheckin.go @@ -219,7 +219,7 @@ func (ct *CheckinT) validateRequest(zlog zerolog.Logger, w http.ResponseWriter, } wTime := pollDuration + time.Minute - rc := http.NewResponseController(w) //nolint:nolintlint,bodyclose // we are working with a ResponseWriter not a Respons + rc := http.NewResponseController(w) if err := rc.SetWriteDeadline(start.Add(wTime)); err != nil { zlog.Warn().Err(err).Time("write_deadline", start.Add(wTime)).Msg("Unable to set checkin write deadline.") } else { From 7eb37ffe8c543971e69996c85dd059c955496623 Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Fri, 28 Feb 2025 17:39:58 +0100 Subject: [PATCH 05/14] Add changelog --- ...pgrade-attempts-when-upgrade-complete.yaml | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml diff --git a/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml b/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml new file mode 100644 index 0000000000..9383499325 --- /dev/null +++ b/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml @@ -0,0 +1,34 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: feature + +# Change summary; a 80ish characters long description of the change. +summary: Clear agent.upgrade_attempts when upgrade is complete. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +description: | + The new AutomaticAgentUpgradeTask Kibana task sets the upgrade_attempts property in agents it upgrades. + This property is used to track upgrade retries and should therefore be cleared when the upgrade is complete. + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: fleet-server + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/fleet-server/pull/4528 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/ingest-dev/issues/4720 From 68f738871a59bd80083f8db4fd02b31af7eeefbd Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Tue, 4 Mar 2025 10:41:05 +0100 Subject: [PATCH 06/14] Update handleCheckin unit test --- internal/pkg/api/handleCheckin_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/api/handleCheckin_test.go b/internal/pkg/api/handleCheckin_test.go index cc5652f2da..644a0f5e66 100644 --- a/internal/pkg/api/handleCheckin_test.go +++ b/internal/pkg/api/handleCheckin_test.go @@ -344,7 +344,7 @@ func TestProcessUpgradeDetails(t *testing.T) { t.Logf("bulk match unmarshal error: %v", err) return false } - return doc.Doc[dl.FieldUpgradeDetails] == nil && doc.Doc[dl.FieldUpgradeStartedAt] == nil && doc.Doc[dl.FieldUpgradeStatus] == nil && doc.Doc[dl.FieldUpgradedAt] != "" + return doc.Doc[dl.FieldUpgradeDetails] == nil && doc.Doc[dl.FieldUpgradeStartedAt] == nil && doc.Doc[dl.FieldUpgradeStatus] == nil && doc.Doc[dl.FieldUpgradedAt] != "" && doc.Doc[dl.FieldUpgradeAttempts] == nil }), mock.Anything, mock.Anything).Return(nil) return mBulk }, From 8d7ce7d20612b124816225477816ba00ae275f4d Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Wed, 5 Mar 2025 18:57:21 +0100 Subject: [PATCH 07/14] Change approach --- ...9-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml | 2 +- internal/pkg/api/handleAck.go | 1 - internal/pkg/api/handleCheckin.go | 5 ++++- internal/pkg/model/schema.go | 3 +++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml b/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml index 9383499325..23acf6ed8b 100644 --- a/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml +++ b/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml @@ -8,7 +8,7 @@ # - security: impacts on the security of a product or a user’s deployment. # - upgrade: important information for someone upgrading from a prior version # - other: does not fit into any of the other categories -kind: feature +kind: enhancement # Change summary; a 80ish characters long description of the change. summary: Clear agent.upgrade_attempts when upgrade is complete. diff --git a/internal/pkg/api/handleAck.go b/internal/pkg/api/handleAck.go index 2e947ae6e9..0e872c40f1 100644 --- a/internal/pkg/api/handleAck.go +++ b/internal/pkg/api/handleAck.go @@ -619,7 +619,6 @@ func (ack *AckT) handleUpgrade(ctx context.Context, zlog zerolog.Logger, agent * dl.FieldUpgradeStartedAt: nil, dl.FieldUpgradeStatus: nil, dl.FieldUpgradedAt: now, - dl.FieldUpgradeAttempts: nil, } } diff --git a/internal/pkg/api/handleCheckin.go b/internal/pkg/api/handleCheckin.go index 672663a6f9..82a8d45b9f 100644 --- a/internal/pkg/api/handleCheckin.go +++ b/internal/pkg/api/handleCheckin.go @@ -500,6 +500,10 @@ func (ct *CheckinT) processUpgradeDetails(ctx context.Context, agent *model.Agen doc := bulk.UpdateFields{ dl.FieldUpgradeDetails: details, } + if agent.UpgradeAttempts != nil && details.State == UpgradeDetailsStateUPGWATCHING { + doc[dl.FieldUpgradeAttempts] = nil + } + body, err := doc.Marshal() if err != nil { return err @@ -521,7 +525,6 @@ func (ct *CheckinT) markUpgradeComplete(ctx context.Context, agent *model.Agent) dl.FieldUpgradeStartedAt: nil, dl.FieldUpgradeStatus: nil, dl.FieldUpgradedAt: time.Now().UTC().Format(time.RFC3339), - dl.FieldUpgradeAttempts: nil, } body, err := doc.Marshal() if err != nil { diff --git a/internal/pkg/model/schema.go b/internal/pkg/model/schema.go index 7991bd3aac..ee7facf117 100644 --- a/internal/pkg/model/schema.go +++ b/internal/pkg/model/schema.go @@ -228,6 +228,9 @@ type Agent struct { // Date/time the Elastic Agent was last upgraded UpgradedAt string `json:"upgraded_at,omitempty"` + // List of timestamps of attempts of Elastic Agent automatic upgrades + UpgradeAttempts []string `json:"upgrade_attempts,omitempty"` + // User provided metadata information for the Elastic Agent UserProvidedMetadata json.RawMessage `json:"user_provided_metadata,omitempty"` } From 062d12eb03b0eafdc3179a0d55577311ec502337 Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Wed, 5 Mar 2025 19:00:50 +0100 Subject: [PATCH 08/14] Revert unit test change --- internal/pkg/api/handleCheckin_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/api/handleCheckin_test.go b/internal/pkg/api/handleCheckin_test.go index 644a0f5e66..cc5652f2da 100644 --- a/internal/pkg/api/handleCheckin_test.go +++ b/internal/pkg/api/handleCheckin_test.go @@ -344,7 +344,7 @@ func TestProcessUpgradeDetails(t *testing.T) { t.Logf("bulk match unmarshal error: %v", err) return false } - return doc.Doc[dl.FieldUpgradeDetails] == nil && doc.Doc[dl.FieldUpgradeStartedAt] == nil && doc.Doc[dl.FieldUpgradeStatus] == nil && doc.Doc[dl.FieldUpgradedAt] != "" && doc.Doc[dl.FieldUpgradeAttempts] == nil + return doc.Doc[dl.FieldUpgradeDetails] == nil && doc.Doc[dl.FieldUpgradeStartedAt] == nil && doc.Doc[dl.FieldUpgradeStatus] == nil && doc.Doc[dl.FieldUpgradedAt] != "" }), mock.Anything, mock.Anything).Return(nil) return mBulk }, From f5eb3dc9c78e9a4450b6ee03150dacd9f3a16799 Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Thu, 6 Mar 2025 11:47:29 +0100 Subject: [PATCH 09/14] This seems needed --- model/schema.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/model/schema.json b/model/schema.json index 6fb03c87cb..8e19b7b7f9 100644 --- a/model/schema.json +++ b/model/schema.json @@ -686,6 +686,13 @@ "description": "Additional upgrade status details.", "type": "object" }, + "upgrade_attempts": { + "description": "List of timestamps of attempts of Elastic Agent automatic upgrades", + "type": "array", + "items": { + "type": "string" + } + }, "replace_token": { "description": "hash of token provided during enrollment that allows replacement by another enrollment with same ID", "type": "string" From 33acbf4557201ea7a05f7010f0cc91f634a46f5f Mon Sep 17 00:00:00 2001 From: Jill Guyonnet Date: Thu, 6 Mar 2025 19:12:23 +0100 Subject: [PATCH 10/14] Run make generate --- internal/pkg/model/schema.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/pkg/model/schema.go b/internal/pkg/model/schema.go index ee7facf117..11ffac6a94 100644 --- a/internal/pkg/model/schema.go +++ b/internal/pkg/model/schema.go @@ -216,6 +216,9 @@ type Agent struct { // Date/time the Elastic Agent was last updated UpdatedAt string `json:"updated_at,omitempty"` + // List of timestamps of attempts of Elastic Agent automatic upgrades + UpgradeAttempts []string `json:"upgrade_attempts,omitempty"` + // Additional upgrade status details. UpgradeDetails *UpgradeDetails `json:"upgrade_details,omitempty"` @@ -228,9 +231,6 @@ type Agent struct { // Date/time the Elastic Agent was last upgraded UpgradedAt string `json:"upgraded_at,omitempty"` - // List of timestamps of attempts of Elastic Agent automatic upgrades - UpgradeAttempts []string `json:"upgrade_attempts,omitempty"` - // User provided metadata information for the Elastic Agent UserProvidedMetadata json.RawMessage `json:"user_provided_metadata,omitempty"` } From 74649e36f3add52c278c45ad77a7569a5aa02b51 Mon Sep 17 00:00:00 2001 From: Julia Bardi <90178898+juliaElastic@users.noreply.github.com> Date: Fri, 7 Mar 2025 15:10:02 +0100 Subject: [PATCH 11/14] Remove internal link --- ...0469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml b/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml index 23acf6ed8b..a57476baa6 100644 --- a/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml +++ b/changelog/fragments/1740760469-Clear-agent-upgrade-attempts-when-upgrade-complete.yaml @@ -31,4 +31,4 @@ pr: https://github.com/elastic/fleet-server/pull/4528 # Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). # If not present is automatically filled by the tooling with the issue linked to the PR number. -issue: https://github.com/elastic/ingest-dev/issues/4720 +# issue: From 56426c20d773fdfbace34d27af641e81409222c2 Mon Sep 17 00:00:00 2001 From: Julia Bardi Date: Mon, 10 Mar 2025 10:38:31 +0100 Subject: [PATCH 12/14] add unit test --- internal/pkg/api/handleCheckin_test.go | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/internal/pkg/api/handleCheckin_test.go b/internal/pkg/api/handleCheckin_test.go index cc5652f2da..da0e52d4fa 100644 --- a/internal/pkg/api/handleCheckin_test.go +++ b/internal/pkg/api/handleCheckin_test.go @@ -17,6 +17,7 @@ import ( "testing" "time" + "github.com/elastic/fleet-server/v7/internal/pkg/bulk" "github.com/elastic/fleet-server/v7/internal/pkg/cache" "github.com/elastic/fleet-server/v7/internal/pkg/checkin" "github.com/elastic/fleet-server/v7/internal/pkg/config" @@ -312,6 +313,17 @@ func TestResolveSeqNo(t *testing.T) { func TestProcessUpgradeDetails(t *testing.T) { esd := model.ESDocument{Id: "doc-ID"} + doc := bulk.UpdateFields{ + dl.FieldUpgradeDetails: &UpgradeDetails{ + ActionId: "test-action", + State: UpgradeDetailsStateUPGWATCHING, + }, + dl.FieldUpgradeAttempts: nil, + } + body, err := doc.Marshal() + if err != nil { + t.Fatalf("marshal error: %v", err) + } tests := []struct { name string agent *model.Agent @@ -625,6 +637,24 @@ func TestProcessUpgradeDetails(t *testing.T) { return mCache }, err: ErrInvalidUpgradeMetadata, + }, { + name: "clear upgrade attempts when watching", + agent: &model.Agent{ESDocument: esd, Agent: &model.AgentMetadata{ID: "test-agent"}, UpgradeAttempts: make([]string, 0)}, + details: &UpgradeDetails{ + ActionId: "test-action", + State: UpgradeDetailsStateUPGWATCHING, + }, + bulk: func() *ftesting.MockBulk { + mBulk := ftesting.NewMockBulk() + mBulk.On("Update", mock.Anything, dl.FleetAgents, "doc-ID", body, mock.Anything, mock.Anything).Return(nil) + return mBulk + }, + cache: func() *testcache.MockCache { + mCache := testcache.NewMockCache() + mCache.On("GetAction", "test-action").Return(model.Action{}, true) + return mCache + }, + err: nil, }} for _, tc := range tests { From 269019ebc71b549e488908252676edf6245e1687 Mon Sep 17 00:00:00 2001 From: Julia Bardi Date: Mon, 10 Mar 2025 11:33:14 +0100 Subject: [PATCH 13/14] reduce complexity --- internal/pkg/api/handleCheckin.go | 39 ++++++++++++++++++------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/internal/pkg/api/handleCheckin.go b/internal/pkg/api/handleCheckin.go index 82a8d45b9f..11a6bfb4e9 100644 --- a/internal/pkg/api/handleCheckin.go +++ b/internal/pkg/api/handleCheckin.go @@ -398,6 +398,26 @@ func (ct *CheckinT) ProcessRequest(zlog zerolog.Logger, w http.ResponseWriter, r return ct.writeResponse(zlog, w, r, agent, resp) } +func (ct *CheckinT) verifyActionExists(vCtx context.Context, vSpan *apm.Span, agent *model.Agent, details *UpgradeDetails) (*model.Action, error) { + action, ok := ct.cache.GetAction(details.ActionId) + if !ok { + actions, err := dl.FindAction(vCtx, ct.bulker, details.ActionId) + if err != nil { + vSpan.End() + return nil, fmt.Errorf("unable to find upgrade_details action: %w", err) + } + if len(actions) == 0 { + vSpan.End() + zerolog.Ctx(vCtx).Warn().Msgf("upgrade_details no action for id %q found (agent id %q)", details.ActionId, agent.Agent.ID) + return nil, nil + } + action = actions[0] + ct.cache.SetAction(action) + } + vSpan.End() + return &action, nil +} + // processUpgradeDetails will verify and set the upgrade_details section of an agent document based on checkin value. // if the agent doc and checkin details are both nil the method is a nop // if the checkin upgrade_details is nil but there was a previous value in the agent doc, fleet-server treats it as a successful upgrade @@ -412,24 +432,11 @@ func (ct *CheckinT) processUpgradeDetails(ctx context.Context, agent *model.Agen } // update docs with in progress details - // verify action exists vSpan, vCtx := apm.StartSpan(ctx, "Check update action", "validate") - action, ok := ct.cache.GetAction(details.ActionId) - if !ok { - actions, err := dl.FindAction(vCtx, ct.bulker, details.ActionId) - if err != nil { - vSpan.End() - return fmt.Errorf("unable to find upgrade_details action: %w", err) - } - if len(actions) == 0 { - vSpan.End() - zerolog.Ctx(vCtx).Warn().Msgf("upgrade_details no action for id %q found (agent id %q)", details.ActionId, agent.Agent.ID) - return nil - } - action = actions[0] - ct.cache.SetAction(action) + action, err := ct.verifyActionExists(ctx, vSpan, agent, details) + if err != nil { + return err } - vSpan.End() // link action with APM spans var links []apm.SpanLink From 9e6d4d48545262631df8991484364b46efff9e4e Mon Sep 17 00:00:00 2001 From: Julia Bardi Date: Mon, 10 Mar 2025 12:39:36 +0100 Subject: [PATCH 14/14] return nil if action is nil --- internal/pkg/api/handleCheckin.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/pkg/api/handleCheckin.go b/internal/pkg/api/handleCheckin.go index 11a6bfb4e9..6bbe9f877e 100644 --- a/internal/pkg/api/handleCheckin.go +++ b/internal/pkg/api/handleCheckin.go @@ -437,6 +437,9 @@ func (ct *CheckinT) processUpgradeDetails(ctx context.Context, agent *model.Agen if err != nil { return err } + if action == nil { + return nil + } // link action with APM spans var links []apm.SpanLink