Skip to content

Commit fa543ec

Browse files
Clear agent.upgrade_attempts on upgrade complete (#4528) (#4777)
* Clear agent.upgrade_attemps on upgrade complete * This actually works * Silence nolintlint error in handleCheckin.go * Remove nolint comment altogether * Add changelog * Update handleCheckin unit test * Change approach * Revert unit test change * This seems needed * Run make generate * Remove internal link * add unit test * reduce complexity * return nil if action is nil --------- Co-authored-by: Julia Bardi <[email protected]> Co-authored-by: Julia Bardi <[email protected]> (cherry picked from commit 2b40416) Co-authored-by: Jill Guyonnet <[email protected]>
1 parent fe1af84 commit fa543ec

File tree

6 files changed

+106
-17
lines changed

6 files changed

+106
-17
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: enhancement
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Clear agent.upgrade_attempts when upgrade is complete.
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodate a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
description: |
20+
The new AutomaticAgentUpgradeTask Kibana task sets the upgrade_attempts property in agents it upgrades.
21+
This property is used to track upgrade retries and should therefore be cleared when the upgrade is complete.
22+
23+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
24+
component: fleet-server
25+
26+
# PR URL; optional; the PR number that added the changeset.
27+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
28+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
29+
# Please provide it if you are adding a fragment for a different PR.
30+
pr: https://github.com/elastic/fleet-server/pull/4528
31+
32+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
33+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
34+
# issue:

internal/pkg/api/handleCheckin.go

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ func (ct *CheckinT) validateRequest(zlog zerolog.Logger, w http.ResponseWriter,
219219
}
220220

221221
wTime := pollDuration + time.Minute
222-
rc := http.NewResponseController(w) //nolint:bodyclose // we are working with a ResponseWriter not a Respons
222+
rc := http.NewResponseController(w)
223223
if err := rc.SetWriteDeadline(start.Add(wTime)); err != nil {
224224
zlog.Warn().Err(err).Time("write_deadline", start.Add(wTime)).Msg("Unable to set checkin write deadline.")
225225
} else {
@@ -398,6 +398,26 @@ func (ct *CheckinT) ProcessRequest(zlog zerolog.Logger, w http.ResponseWriter, r
398398
return ct.writeResponse(zlog, w, r, agent, resp)
399399
}
400400

401+
func (ct *CheckinT) verifyActionExists(vCtx context.Context, vSpan *apm.Span, agent *model.Agent, details *UpgradeDetails) (*model.Action, error) {
402+
action, ok := ct.cache.GetAction(details.ActionId)
403+
if !ok {
404+
actions, err := dl.FindAction(vCtx, ct.bulker, details.ActionId)
405+
if err != nil {
406+
vSpan.End()
407+
return nil, fmt.Errorf("unable to find upgrade_details action: %w", err)
408+
}
409+
if len(actions) == 0 {
410+
vSpan.End()
411+
zerolog.Ctx(vCtx).Warn().Msgf("upgrade_details no action for id %q found (agent id %q)", details.ActionId, agent.Agent.ID)
412+
return nil, nil
413+
}
414+
action = actions[0]
415+
ct.cache.SetAction(action)
416+
}
417+
vSpan.End()
418+
return &action, nil
419+
}
420+
401421
// processUpgradeDetails will verify and set the upgrade_details section of an agent document based on checkin value.
402422
// if the agent doc and checkin details are both nil the method is a nop
403423
// if the checkin upgrade_details is nil but there was a previous value in the agent doc, fleet-server treats it as a successful upgrade
@@ -412,24 +432,14 @@ func (ct *CheckinT) processUpgradeDetails(ctx context.Context, agent *model.Agen
412432
}
413433
// update docs with in progress details
414434

415-
// verify action exists
416435
vSpan, vCtx := apm.StartSpan(ctx, "Check update action", "validate")
417-
action, ok := ct.cache.GetAction(details.ActionId)
418-
if !ok {
419-
actions, err := dl.FindAction(vCtx, ct.bulker, details.ActionId)
420-
if err != nil {
421-
vSpan.End()
422-
return fmt.Errorf("unable to find upgrade_details action: %w", err)
423-
}
424-
if len(actions) == 0 {
425-
vSpan.End()
426-
zerolog.Ctx(vCtx).Warn().Msgf("upgrade_details no action for id %q found (agent id %q)", details.ActionId, agent.Agent.ID)
427-
return nil
428-
}
429-
action = actions[0]
430-
ct.cache.SetAction(action)
436+
action, err := ct.verifyActionExists(ctx, vSpan, agent, details)
437+
if err != nil {
438+
return err
439+
}
440+
if action == nil {
441+
return nil
431442
}
432-
vSpan.End()
433443

434444
// link action with APM spans
435445
var links []apm.SpanLink
@@ -500,6 +510,10 @@ func (ct *CheckinT) processUpgradeDetails(ctx context.Context, agent *model.Agen
500510
doc := bulk.UpdateFields{
501511
dl.FieldUpgradeDetails: details,
502512
}
513+
if agent.UpgradeAttempts != nil && details.State == UpgradeDetailsStateUPGWATCHING {
514+
doc[dl.FieldUpgradeAttempts] = nil
515+
}
516+
503517
body, err := doc.Marshal()
504518
if err != nil {
505519
return err

internal/pkg/api/handleCheckin_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"testing"
1818
"time"
1919

20+
"github.com/elastic/fleet-server/v7/internal/pkg/bulk"
2021
"github.com/elastic/fleet-server/v7/internal/pkg/cache"
2122
"github.com/elastic/fleet-server/v7/internal/pkg/checkin"
2223
"github.com/elastic/fleet-server/v7/internal/pkg/config"
@@ -312,6 +313,17 @@ func TestResolveSeqNo(t *testing.T) {
312313

313314
func TestProcessUpgradeDetails(t *testing.T) {
314315
esd := model.ESDocument{Id: "doc-ID"}
316+
doc := bulk.UpdateFields{
317+
dl.FieldUpgradeDetails: &UpgradeDetails{
318+
ActionId: "test-action",
319+
State: UpgradeDetailsStateUPGWATCHING,
320+
},
321+
dl.FieldUpgradeAttempts: nil,
322+
}
323+
body, err := doc.Marshal()
324+
if err != nil {
325+
t.Fatalf("marshal error: %v", err)
326+
}
315327
tests := []struct {
316328
name string
317329
agent *model.Agent
@@ -625,6 +637,24 @@ func TestProcessUpgradeDetails(t *testing.T) {
625637
return mCache
626638
},
627639
err: ErrInvalidUpgradeMetadata,
640+
}, {
641+
name: "clear upgrade attempts when watching",
642+
agent: &model.Agent{ESDocument: esd, Agent: &model.AgentMetadata{ID: "test-agent"}, UpgradeAttempts: make([]string, 0)},
643+
details: &UpgradeDetails{
644+
ActionId: "test-action",
645+
State: UpgradeDetailsStateUPGWATCHING,
646+
},
647+
bulk: func() *ftesting.MockBulk {
648+
mBulk := ftesting.NewMockBulk()
649+
mBulk.On("Update", mock.Anything, dl.FleetAgents, "doc-ID", body, mock.Anything, mock.Anything).Return(nil)
650+
return mBulk
651+
},
652+
cache: func() *testcache.MockCache {
653+
mCache := testcache.NewMockCache()
654+
mCache.On("GetAction", "test-action").Return(model.Action{}, true)
655+
return mCache
656+
},
657+
err: nil,
628658
}}
629659

630660
for _, tc := range tests {

internal/pkg/dl/constants.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ const (
5454
FieldUpgradeStartedAt = "upgrade_started_at"
5555
FieldUpgradeStatus = "upgrade_status"
5656
FieldUpgradeDetails = "upgrade_details"
57+
FieldUpgradeAttempts = "upgrade_attempts"
5758

5859
FieldAuditUnenrolledTime = "audit_unenrolled_time"
5960
FieldAuditUnenrolledReason = "audit_unenrolled_reason"

internal/pkg/model/schema.go

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

model/schema.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,13 @@
686686
"description": "Additional upgrade status details.",
687687
"type": "object"
688688
},
689+
"upgrade_attempts": {
690+
"description": "List of timestamps of attempts of Elastic Agent automatic upgrades",
691+
"type": "array",
692+
"items": {
693+
"type": "string"
694+
}
695+
},
689696
"replace_token": {
690697
"description": "hash of token provided during enrollment that allows replacement by another enrollment with same ID",
691698
"type": "string"

0 commit comments

Comments
 (0)