diff --git a/oas_docs/bundle.json b/oas_docs/bundle.json index afc33a7c70bf8..1f5b03236870d 100644 --- a/oas_docs/bundle.json +++ b/oas_docs/bundle.json @@ -17675,6 +17675,13 @@ "nullable": true, "type": "array" }, + "upgrade_attempts": { + "items": { + "type": "string" + }, + "nullable": true, + "type": "array" + }, "upgrade_details": { "additionalProperties": false, "nullable": true, @@ -19722,6 +19729,13 @@ "nullable": true, "type": "array" }, + "upgrade_attempts": { + "items": { + "type": "string" + }, + "nullable": true, + "type": "array" + }, "upgrade_details": { "additionalProperties": false, "nullable": true, @@ -20206,6 +20220,13 @@ "nullable": true, "type": "array" }, + "upgrade_attempts": { + "items": { + "type": "string" + }, + "nullable": true, + "type": "array" + }, "upgrade_details": { "additionalProperties": false, "nullable": true, diff --git a/oas_docs/bundle.serverless.json b/oas_docs/bundle.serverless.json index af2f0731ed516..dfa3894d2a029 100644 --- a/oas_docs/bundle.serverless.json +++ b/oas_docs/bundle.serverless.json @@ -17675,6 +17675,13 @@ "nullable": true, "type": "array" }, + "upgrade_attempts": { + "items": { + "type": "string" + }, + "nullable": true, + "type": "array" + }, "upgrade_details": { "additionalProperties": false, "nullable": true, @@ -19722,6 +19729,13 @@ "nullable": true, "type": "array" }, + "upgrade_attempts": { + "items": { + "type": "string" + }, + "nullable": true, + "type": "array" + }, "upgrade_details": { "additionalProperties": false, "nullable": true, @@ -20206,6 +20220,13 @@ "nullable": true, "type": "array" }, + "upgrade_attempts": { + "items": { + "type": "string" + }, + "nullable": true, + "type": "array" + }, "upgrade_details": { "additionalProperties": false, "nullable": true, diff --git a/oas_docs/output/kibana.serverless.yaml b/oas_docs/output/kibana.serverless.yaml index 4dd1d52d3663c..82a318d81bf40 100644 --- a/oas_docs/output/kibana.serverless.yaml +++ b/oas_docs/output/kibana.serverless.yaml @@ -18992,6 +18992,11 @@ paths: type: string nullable: true type: array + upgrade_attempts: + items: + type: string + nullable: true + type: array upgrade_details: additionalProperties: false nullable: true @@ -19447,6 +19452,11 @@ paths: type: string nullable: true type: array + upgrade_attempts: + items: + type: string + nullable: true + type: array upgrade_details: additionalProperties: false nullable: true @@ -19790,6 +19800,11 @@ paths: type: string nullable: true type: array + upgrade_attempts: + items: + type: string + nullable: true + type: array upgrade_details: additionalProperties: false nullable: true diff --git a/oas_docs/output/kibana.yaml b/oas_docs/output/kibana.yaml index 68a2b5c909e2f..696f9b72e574c 100644 --- a/oas_docs/output/kibana.yaml +++ b/oas_docs/output/kibana.yaml @@ -21113,6 +21113,11 @@ paths: type: string nullable: true type: array + upgrade_attempts: + items: + type: string + nullable: true + type: array upgrade_details: additionalProperties: false nullable: true @@ -21565,6 +21570,11 @@ paths: type: string nullable: true type: array + upgrade_attempts: + items: + type: string + nullable: true + type: array upgrade_details: additionalProperties: false nullable: true @@ -21907,6 +21917,11 @@ paths: type: string nullable: true type: array + upgrade_attempts: + items: + type: string + nullable: true + type: array upgrade_details: additionalProperties: false nullable: true diff --git a/x-pack/platform/plugins/shared/fleet/common/constants/index.ts b/x-pack/platform/plugins/shared/fleet/common/constants/index.ts index f51e85f22c526..a83d626907292 100644 --- a/x-pack/platform/plugins/shared/fleet/common/constants/index.ts +++ b/x-pack/platform/plugins/shared/fleet/common/constants/index.ts @@ -58,3 +58,5 @@ export const FLEET_ENROLLMENT_API_PREFIX = 'fleet-enrollment-api-keys'; export const REQUEST_DIAGNOSTICS_TIMEOUT_MS = 3 * 60 * 60 * 1000; // 3 hours; export * from './mappings'; + +export const AUTO_UPGRADE_DEFAULT_RETRIES = ['30m', '1h', '2h', '4h', '8h', '16h', '24h']; diff --git a/x-pack/platform/plugins/shared/fleet/common/constants/mappings.ts b/x-pack/platform/plugins/shared/fleet/common/constants/mappings.ts index f3d2b200cac58..80960450820e0 100644 --- a/x-pack/platform/plugins/shared/fleet/common/constants/mappings.ts +++ b/x-pack/platform/plugins/shared/fleet/common/constants/mappings.ts @@ -363,6 +363,9 @@ export const AGENT_MAPPINGS = { }, }, }, + upgrade_attempts: { + type: 'date', + }, // added to allow validation on status field status: { type: 'keyword', diff --git a/x-pack/platform/plugins/shared/fleet/common/types/index.ts b/x-pack/platform/plugins/shared/fleet/common/types/index.ts index a85e0a86e4fcc..b6789eb33404a 100644 --- a/x-pack/platform/plugins/shared/fleet/common/types/index.ts +++ b/x-pack/platform/plugins/shared/fleet/common/types/index.ts @@ -84,6 +84,9 @@ export interface FleetConfigType { }; }; createArtifactsBulkBatchSize?: number; + autoUpgrades?: { + retryDelays?: string[]; + }; } // Calling Object.entries(PackagesGroupedByStatus) gave `status: string` diff --git a/x-pack/platform/plugins/shared/fleet/common/types/models/agent.ts b/x-pack/platform/plugins/shared/fleet/common/types/models/agent.ts index 5c1d85bfb4497..f9a9319b259d9 100644 --- a/x-pack/platform/plugins/shared/fleet/common/types/models/agent.ts +++ b/x-pack/platform/plugins/shared/fleet/common/types/models/agent.ts @@ -99,6 +99,7 @@ interface AgentBase { upgraded_at?: string | null; upgrade_started_at?: string | null; upgrade_details?: AgentUpgradeDetails; + upgrade_attempts?: string[] | null; access_api_key_id?: string; default_api_key?: string; default_api_key_id?: string; @@ -275,6 +276,10 @@ export interface FleetServerAgent { * Upgrade state of the Elastic Agent */ upgrade_details?: AgentUpgradeDetails; + /** + * List of timestamps of attempts of Elastic Agent automatic upgrades + */ + upgrade_attempts?: string[] | null; access_api_key_id?: string; agent?: FleetServerAgentMetadata; /** diff --git a/x-pack/platform/plugins/shared/fleet/public/applications/fleet/components/search_bar.test.tsx b/x-pack/platform/plugins/shared/fleet/public/applications/fleet/components/search_bar.test.tsx index 8edb28f341230..bce71f017cd15 100644 --- a/x-pack/platform/plugins/shared/fleet/public/applications/fleet/components/search_bar.test.tsx +++ b/x-pack/platform/plugins/shared/fleet/public/applications/fleet/components/search_bar.test.tsx @@ -174,7 +174,7 @@ describe('SearchBar', () => { describe('getFieldSpecs', () => { it('returns fieldSpecs for Fleet agents', () => { - expect(getFieldSpecs(AGENTS_INDEX, AGENTS_PREFIX)).toHaveLength(73); + expect(getFieldSpecs(AGENTS_INDEX, AGENTS_PREFIX)).toHaveLength(74); }); it('returns fieldSpecs for Fleet enrollment tokens', () => { diff --git a/x-pack/platform/plugins/shared/fleet/server/config.ts b/x-pack/platform/plugins/shared/fleet/server/config.ts index a1c0958442584..3147ba713d347 100644 --- a/x-pack/platform/plugins/shared/fleet/server/config.ts +++ b/x-pack/platform/plugins/shared/fleet/server/config.ts @@ -282,6 +282,11 @@ export const config: PluginConfigDescriptor = { min: 400, }) ), + autoUpgrades: schema.maybe( + schema.object({ + retryDelays: schema.maybe(schema.arrayOf(schema.string())), + }) + ), }, { validate: (configToValidate) => { diff --git a/x-pack/platform/plugins/shared/fleet/server/services/agents/helpers.ts b/x-pack/platform/plugins/shared/fleet/server/services/agents/helpers.ts index 26c08e2194409..04b9a0a6ebfb8 100644 --- a/x-pack/platform/plugins/shared/fleet/server/services/agents/helpers.ts +++ b/x-pack/platform/plugins/shared/fleet/server/services/agents/helpers.ts @@ -69,6 +69,7 @@ export function searchHitToAgent( upgraded_at: hit._source?.upgraded_at, upgrade_started_at: hit._source?.upgrade_started_at, upgrade_details: hit._source?.upgrade_details, + upgrade_attempts: hit._source?.upgrade_attempts, access_api_key_id: hit._source?.access_api_key_id, default_api_key_id: hit._source?.default_api_key_id, policy_id: hit._source?.policy_id, diff --git a/x-pack/platform/plugins/shared/fleet/server/services/agents/upgrade.ts b/x-pack/platform/plugins/shared/fleet/server/services/agents/upgrade.ts index 3976d31658d53..7f35ef59486ea 100644 --- a/x-pack/platform/plugins/shared/fleet/server/services/agents/upgrade.ts +++ b/x-pack/platform/plugins/shared/fleet/server/services/agents/upgrade.ts @@ -125,3 +125,22 @@ export async function sendUpgradeAgentsActions( return await upgradeBatch(esClient, givenAgents, outgoingErrors, options, currentSpaceId); } + +export async function sendAutomaticUpgradeAgentsActions( + soClient: SavedObjectsClientContract, + esClient: ElasticsearchClient, + options: { + agents: Agent[]; + version: string; + upgradeDurationSeconds?: number; + } +): Promise<{ actionId: string }> { + const currentSpaceId = getCurrentNamespace(soClient); + return await upgradeBatch( + esClient, + options.agents, + {}, + { ...options, isAutomatic: true }, + currentSpaceId + ); +} diff --git a/x-pack/platform/plugins/shared/fleet/server/services/agents/upgrade_action_runner.ts b/x-pack/platform/plugins/shared/fleet/server/services/agents/upgrade_action_runner.ts index 32a115f779c56..5f465c8c7f479 100644 --- a/x-pack/platform/plugins/shared/fleet/server/services/agents/upgrade_action_runner.ts +++ b/x-pack/platform/plugins/shared/fleet/server/services/agents/upgrade_action_runner.ts @@ -9,11 +9,13 @@ import type { ElasticsearchClient } from '@kbn/core/server'; import { v4 as uuidv4 } from 'uuid'; import moment from 'moment'; +import semverGte from 'semver/functions/gte'; import { getRecentUpgradeInfoForAgent, getNotUpgradeableMessage, isAgentUpgradeableToVersion, + AGENT_UPGARDE_DETAILS_SUPPORTED_VERSION, } from '../../../common/services'; import type { Agent } from '../../types'; @@ -168,6 +170,10 @@ export async function upgradeBatch( data: { upgraded_at: null, upgrade_started_at: now, + ...(options.isAutomatic && + semverGte(agent.agent?.version ?? '0.0.0', AGENT_UPGARDE_DETAILS_SUPPORTED_VERSION) + ? { upgrade_attempts: [now, ...(agent.upgrade_attempts ?? [])] } + : {}), }, })), errors diff --git a/x-pack/platform/plugins/shared/fleet/server/tasks/automatic_agent_upgrade_task.test.ts b/x-pack/platform/plugins/shared/fleet/server/tasks/automatic_agent_upgrade_task.test.ts index b4594575ea23b..7b15d0cfc22fc 100644 --- a/x-pack/platform/plugins/shared/fleet/server/tasks/automatic_agent_upgrade_task.test.ts +++ b/x-pack/platform/plugins/shared/fleet/server/tasks/automatic_agent_upgrade_task.test.ts @@ -17,7 +17,7 @@ import { agentPolicyService, appContextService } from '../services'; import { fetchAllAgentsByKuery, getAgentsByKuery, - sendUpgradeAgentsActions, + sendAutomaticUpgradeAgentsActions, } from '../services/agents'; import { isAgentUpgradeable } from '../../common/services'; import type { Agent, AgentPolicy } from '../types'; @@ -47,9 +47,10 @@ const mockedFetchAllAgentsByKuery = fetchAllAgentsByKuery as jest.MockedFunction typeof fetchAllAgentsByKuery >; const mockedGetAgentsByKuery = getAgentsByKuery as jest.MockedFunction; -const mockedSendUpgradeAgentsActions = sendUpgradeAgentsActions as jest.MockedFunction< - typeof sendUpgradeAgentsActions ->; +const mockedSendAutomaticUpgradeAgentsActions = + sendAutomaticUpgradeAgentsActions as jest.MockedFunction< + typeof sendAutomaticUpgradeAgentsActions + >; const mockedIsAgentUpgradeable = isAgentUpgradeable as jest.MockedFunction< typeof isAgentUpgradeable >; @@ -79,8 +80,8 @@ const mockDefaultAgentPolicy = () => { const generateAgents = ( nAgents: number, - agentPolicyId: string, - version: string, + agentPolicyId: string = 'agent-policy-1', + version: string = '8.15.0', status: string = 'online' ) => { return [ @@ -150,12 +151,13 @@ describe('AutomaticAgentUpgradeTask', () => { jest .spyOn(appContextService, 'getExperimentalFeatures') .mockReturnValue({ enableAutomaticAgentUpgrades: true } as any); + mockDefaultAgentPolicy(); mockedIsAgentUpgradeable.mockReturnValue(true); - mockedSendUpgradeAgentsActions.mockResolvedValue({ actionId: 'action-1' }); + mockedSendAutomaticUpgradeAgentsActions.mockResolvedValue({ actionId: 'action-1' }); }); afterEach(() => { - jest.clearAllMocks(); + jest.resetAllMocks(); }); it('Should not run if task is outdated', async () => { @@ -176,116 +178,106 @@ describe('AutomaticAgentUpgradeTask', () => { }); it('Should upgrade eligible agents', async () => { - mockDefaultAgentPolicy(); - const agents = generateAgents(10, 'agent-policy-1', '8.15.0'); + const agents = generateAgents(10); mockedGetAgentsByKuery .mockResolvedValueOnce({ total: agents.length } as any) // active agents .mockResolvedValueOnce({ total: 0 } as any); // agents on or updating to target version - mockedFetchAllAgentsByKuery.mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); + mockedFetchAllAgentsByKuery + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery([])) // agents marked for retry + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery(agents)); // active agents await runTask(); - expect(mockedSendUpgradeAgentsActions).toHaveBeenCalledWith( + expect(mockedSendAutomaticUpgradeAgentsActions).toHaveBeenCalledWith( expect.anything(), expect.anything(), { agents: agents.slice(0, 3), version: '8.18.0', - isAutomatic: true, } ); }); it('Should take agents already on target version into account', async () => { - mockDefaultAgentPolicy(); const agents = [ - ...generateAgents(10, 'agent-policy-1', '8.15.0'), - { - id: 'agent-11', - policy_id: 'agent-policy-1', - status: 'online', - agent: { version: '8.18.0' }, - }, - ] as Agent[]; + ...generateAgents(10), + ...generateAgents(1, 'agent-policy-1', '8.18.0', 'online'), + ]; mockedGetAgentsByKuery .mockResolvedValueOnce({ total: agents.length } as any) // active agents .mockResolvedValueOnce({ total: 1 } as any); // agents on or updating to target version - mockedFetchAllAgentsByKuery.mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); + mockedFetchAllAgentsByKuery + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery([])) // agents marked for retry + .mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); // active agents await runTask(); - expect(mockedSendUpgradeAgentsActions).toHaveBeenCalledWith( + expect(mockedSendAutomaticUpgradeAgentsActions).toHaveBeenCalledWith( expect.anything(), expect.anything(), { agents: agents.slice(0, 2), version: '8.18.0', - isAutomatic: true, } ); }); it('Should take agents already upgrading to target version into account', async () => { - mockDefaultAgentPolicy(); const agents = [ - ...generateAgents(10, 'agent-policy-1', '8.15.0'), - { - id: 'agent-11', - policy_id: 'agent-policy-1', - status: 'updating', - agent: { version: '8.15.0' }, - upgrade_details: { target_version: '8.18.0' }, - }, - ] as Agent[]; + ...generateAgents(10), + ...generateAgents(1, 'agent-policy-1', '8.15.0', 'updating'), + ]; mockedGetAgentsByKuery .mockResolvedValueOnce({ total: agents.length } as any) // active agents .mockResolvedValueOnce({ total: 1 } as any); // agents on or updating to target version - mockedFetchAllAgentsByKuery.mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); + mockedFetchAllAgentsByKuery + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery([])) // agents marked for retry + .mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); await runTask(); - expect(mockedSendUpgradeAgentsActions).toHaveBeenCalledWith( + expect(mockedSendAutomaticUpgradeAgentsActions).toHaveBeenCalledWith( expect.anything(), expect.anything(), { agents: agents.slice(0, 2), version: '8.18.0', - isAutomatic: true, } ); }); it('Should not attempt to upgrade already upgrading agents', async () => { - mockDefaultAgentPolicy(); const agents = generateAgents(10, 'agent-policy-1', '8.15.0', 'updating'); mockedGetAgentsByKuery .mockResolvedValueOnce({ total: agents.length } as any) // active agents .mockResolvedValueOnce({ total: agents.length } as any); // agents on or updating to target version - mockedFetchAllAgentsByKuery.mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); // active agents + mockedFetchAllAgentsByKuery + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery([])) // agents marked for retry + .mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); // active agents await runTask(); - expect(mockedSendUpgradeAgentsActions).not.toHaveBeenCalled(); + expect(mockedSendAutomaticUpgradeAgentsActions).not.toHaveBeenCalled(); }); it('Should set a rollout duration for upgrade batches bigger than 10 agents', async () => { - mockDefaultAgentPolicy(); - const agents = generateAgents(100, 'agent-policy-1', '8.15.0'); + const agents = generateAgents(100); mockedGetAgentsByKuery .mockResolvedValueOnce({ total: agents.length } as any) // active agents .mockResolvedValueOnce({ total: 0 } as any); // agents on or updating to target version - mockedFetchAllAgentsByKuery.mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); + mockedFetchAllAgentsByKuery + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery([])) // agents marked for retry + .mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); // active agents await runTask(); - expect(mockedSendUpgradeAgentsActions).toHaveBeenCalledWith( + expect(mockedSendAutomaticUpgradeAgentsActions).toHaveBeenCalledWith( expect.anything(), expect.anything(), { agents: agents.slice(0, 30), version: '8.18.0', upgradeDurationSeconds: 600, - isAutomatic: true, } ); }); @@ -310,21 +302,22 @@ describe('AutomaticAgentUpgradeTask', () => { })() ); mockedGetAgentsByKuery - .mockResolvedValueOnce({ total: 0 } as any) // active agents for first policy - .mockResolvedValueOnce({ total: 10 } as any) // active agents - .mockResolvedValueOnce({ total: 0 } as any); // agents on or updating to target version + .mockResolvedValueOnce({ total: 0 } as any) // active agents for first policy batch + .mockResolvedValueOnce({ total: 10 } as any) // active agents for second policy batch + .mockResolvedValueOnce({ total: 0 } as any); // agents on or updating to target version (second policy batch) const agents = generateAgents(10, 'agent-policy-501', '8.15.0'); - mockedFetchAllAgentsByKuery.mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); + mockedFetchAllAgentsByKuery + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery([])) // agents marked for retry + .mockResolvedValue(getMockFetchAllAgentsByKuery(agents)); // active agents await runTask(); - expect(mockedSendUpgradeAgentsActions).toHaveBeenCalledWith( + expect(mockedSendAutomaticUpgradeAgentsActions).toHaveBeenCalledWith( expect.anything(), expect.anything(), { agents: agents.slice(0, 3), version: '8.18.0', - isAutomatic: true, } ); }); @@ -338,38 +331,111 @@ describe('AutomaticAgentUpgradeTask', () => { ] as AgentPolicy[]; mockAgentPolicyService.fetchAllAgentPolicies = getMockAgentPolicyFetchAllAgentPolicies(agentPolicies); - const agents = generateAgents(20, 'agent-policy-1', '8.15.0'); + const agents = generateAgents(20); const firstAgentsBatch = agents.slice(0, 10); const secondAgentsBatch = agents.slice(10); mockedGetAgentsByKuery .mockResolvedValueOnce({ total: agents.length } as any) // active agents .mockResolvedValueOnce({ total: 0 } as any); // agents on or updating to target version - mockedFetchAllAgentsByKuery.mockResolvedValue( - jest.fn(async function* () { - yield firstAgentsBatch; - yield secondAgentsBatch; - })() - ); + mockedFetchAllAgentsByKuery + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery([])) // agents marked for retry + .mockResolvedValueOnce( + jest.fn(async function* () { + yield firstAgentsBatch; + yield secondAgentsBatch; + })() + ); await runTask(); - expect(mockedSendUpgradeAgentsActions).toHaveBeenCalledWith( + expect(mockedSendAutomaticUpgradeAgentsActions).toHaveBeenCalledWith( expect.anything(), expect.anything(), { agents: firstAgentsBatch, version: '8.18.0', upgradeDurationSeconds: 600, - isAutomatic: true, } ); - expect(mockedSendUpgradeAgentsActions).toHaveBeenCalledWith( + expect(mockedSendAutomaticUpgradeAgentsActions).toHaveBeenCalledWith( expect.anything(), expect.anything(), { agents: secondAgentsBatch.slice(0, 4), version: '8.18.0', - isAutomatic: true, + } + ); + }); + + it('Should pick up agents in failed upgrade state for retry if they are ready', async () => { + jest + .spyOn(appContextService, 'getConfig') + .mockReturnValue({ autoUpgrades: { retryDelays: ['10m', '20m'] } } as any); + + const agentPolicies = [ + { + id: 'agent-policy-1', + required_versions: [{ version: '8.18.0', percentage: 100 }], + }, + ] as AgentPolicy[]; + mockAgentPolicyService.fetchAllAgentPolicies = + getMockAgentPolicyFetchAllAgentPolicies(agentPolicies); + + const getDate = (minutesAgo: number) => { + return new Date(Date.now() - minutesAgo * 60000).toISOString(); + }; + + const agents = [ + { + id: 'agent-1', + policy_id: 'agent-policy-1', + status: 'online', + agent: { version: '8.15.0' }, + upgrade_details: { + target_version: '8.18.0', + state: 'UPG_FAILED', + }, + upgrade_attempts: [getDate(20)], // should be picked up + }, + { + id: 'agent-2', + policy_id: 'agent-policy-1', + status: 'online', + agent: { version: '8.15.0' }, + upgrade_details: { + target_version: '8.18.0', + state: 'UPG_FAILED', + }, + upgrade_attempts: [getDate(5)], // should NOT be picked up (not ready yet) + }, + { + id: 'agent-3', + policy_id: 'agent-policy-1', + status: 'online', + agent: { version: '8.15.0' }, + upgrade_details: { + target_version: '8.18.0', + state: 'UPG_FAILED', + }, + upgrade_attempts: [getDate(20), getDate(10), getDate(5)], // should NOT be picked up (exceeded max attempts) + }, + ] as unknown as Agent[]; + + mockedGetAgentsByKuery + .mockResolvedValueOnce({ total: agents.length } as any) // active agents + .mockResolvedValueOnce({ total: 0 } as any); // agents on or updating to target version + mockedFetchAllAgentsByKuery + .mockResolvedValueOnce(getMockFetchAllAgentsByKuery(agents)) // agents marked for retry + .mockResolvedValue(getMockFetchAllAgentsByKuery([])); + + await runTask(); + + expect(mockedSendAutomaticUpgradeAgentsActions).toHaveBeenCalledWith( + expect.anything(), + expect.anything(), + { + agents: agents.slice(0, 1), + version: '8.18.0', } ); }); diff --git a/x-pack/platform/plugins/shared/fleet/server/tasks/automatic_agent_upgrade_task.ts b/x-pack/platform/plugins/shared/fleet/server/tasks/automatic_agent_upgrade_task.ts index c35f79c17d3c8..c26581f8f6df9 100644 --- a/x-pack/platform/plugins/shared/fleet/server/tasks/automatic_agent_upgrade_task.ts +++ b/x-pack/platform/plugins/shared/fleet/server/tasks/automatic_agent_upgrade_task.ts @@ -21,7 +21,9 @@ import { getDeleteTaskRunResult } from '@kbn/task-manager-plugin/server/task'; import type { LoggerFactory } from '@kbn/core/server'; import { errors } from '@elastic/elasticsearch'; import semverGt from 'semver/functions/gt'; +import moment from 'moment'; +import { AUTO_UPGRADE_DEFAULT_RETRIES } from '../../common/constants'; import type { Agent, AgentPolicy, @@ -33,7 +35,7 @@ import { agentPolicyService, appContextService } from '../services'; import { fetchAllAgentsByKuery, getAgentsByKuery, - sendUpgradeAgentsActions, + sendAutomaticUpgradeAgentsActions, } from '../services/agents'; import { AGENT_POLICY_SAVED_OBJECT_TYPE } from '../constants'; import { AgentStatusKueryHelper, isAgentUpgradeable } from '../../common/services'; @@ -42,7 +44,7 @@ export const TYPE = 'fleet:automatic-agent-upgrade-task'; export const VERSION = '1.0.0'; const TITLE = 'Fleet Automatic agent upgrades'; const SCOPE = ['fleet']; -const INTERVAL = '1h'; +const INTERVAL = '30m'; const TIMEOUT = '10m'; const AGENT_POLICIES_BATCHSIZE = 500; const AGENTS_BATCHSIZE = 10000; @@ -64,6 +66,7 @@ export class AutomaticAgentUpgradeTask { private logger: Logger; private wasStarted: boolean = false; private abortController = new AbortController(); + private retryDelays: string[] = []; constructor(setupContract: AutomaticAgentUpgradeTaskSetupContract) { const { core, taskManager, logFactory } = setupContract; @@ -141,6 +144,8 @@ export class AutomaticAgentUpgradeTask { const [coreStart] = await core.getStartServices(); const esClient = coreStart.elasticsearch.client.asInternalUser; const soClient = new SavedObjectsClient(coreStart.savedObjects.createInternalRepository()); + this.retryDelays = + appContextService.getConfig()?.autoUpgrades?.retryDelays ?? AUTO_UPGRADE_DEFAULT_RETRIES; try { await this.checkAgentPoliciesForAutomaticUpgrades(esClient, soClient); @@ -160,6 +165,12 @@ export class AutomaticAgentUpgradeTask { this.logger.info(`[AutomaticAgentUpgradeTask] runTask() ended${msg ? ': ' + msg : ''}`); } + private throwIfAborted() { + if (this.abortController.signal.aborted) { + throw new Error('Task was aborted'); + } + } + private async checkAgentPoliciesForAutomaticUpgrades( esClient: ElasticsearchClient, soClient: SavedObjectsClientContract @@ -179,10 +190,7 @@ export class AutomaticAgentUpgradeTask { return; } for (const agentPolicy of agentPolicyPageResults) { - if (this.abortController.signal.aborted) { - throw new Error('Task was aborted'); - } - + this.throwIfAborted(); await this.checkAgentPolicyForAutomaticUpgrades(esClient, soClient, agentPolicy); } } @@ -204,7 +212,7 @@ export class AutomaticAgentUpgradeTask { const totalActiveAgents = await this.getAgentCount( esClient, soClient, - this.getActiveAgentsKuery(agentPolicy) + `policy_id:${agentPolicy.id} AND ${AgentStatusKueryHelper.buildKueryForActiveAgents()}` ); if (totalActiveAgents === 0) { this.logger.debug( @@ -237,15 +245,6 @@ export class AutomaticAgentUpgradeTask { return res.total; } - private getActiveAgentsKuery(agentPolicy: AgentPolicy) { - return `policy_id:${agentPolicy.id} AND ${AgentStatusKueryHelper.buildKueryForActiveAgents()}`; - } - - private getOnOrUpdatingToVersionKuery(agentPolicy: AgentPolicy, version: string) { - const updatingToKuery = `(upgrade_details.target_version:${version} AND NOT upgrade_details.state:UPG_FAILED)`; - return `policy_id:${agentPolicy.id} AND (agent.version:${version} OR ${updatingToKuery})`; - } - private async processRequiredVersion( esClient: ElasticsearchClient, soClient: SavedObjectsClientContract, @@ -262,10 +261,11 @@ export class AutomaticAgentUpgradeTask { (totalActiveAgents * requiredVersion.percentage) / 100 ); // Subtract total number of agents already or on or updating to target version. + const updatingToKuery = `(upgrade_details.target_version:${requiredVersion.version} AND NOT upgrade_details.state:UPG_FAILED)`; const totalOnOrUpdatingToTargetVersionAgents = await this.getAgentCount( esClient, soClient, - this.getOnOrUpdatingToVersionKuery(agentPolicy, requiredVersion.version) + `policy_id:${agentPolicy.id} AND (agent.version:${requiredVersion.version} OR ${updatingToKuery})` ); numberOfAgentsForUpgrade -= totalOnOrUpdatingToTargetVersionAgents; // Return if target is already met. @@ -276,21 +276,44 @@ export class AutomaticAgentUpgradeTask { return; } - // Fetch all active agents assigned to the policy in batches. + // Handle retries. + const numberOfRetriedAgents = await this.processRetries( + esClient, + soClient, + agentPolicy, + requiredVersion.version + ); + numberOfAgentsForUpgrade -= numberOfRetriedAgents; + if (numberOfAgentsForUpgrade <= 0) { + return; + } + + // Fetch candidate agents assigned to the policy in batches. // NB: ideally, we would query active agents on or below the target version. Unfortunately, this is not possible because agent.version // is stored as text, so semver comparison cannot be done in the ES query (cf. https://github.com/elastic/kibana/issues/168604). // As an imperfect alternative, sort agents by version. Since versions sort alphabetically, this will not always result in ascending semver sorting. - const activeAgentsFetcher = await fetchAllAgentsByKuery(esClient, soClient, { - kuery: this.getActiveAgentsKuery(agentPolicy), + const statusKuery = + '(status:online OR status:offline OR status:enrolling OR status:degraded OR status:error OR status:orphaned)'; // active status except updating + const oldStuckInUpdatingKuery = `(NOT upgrade_details:* AND status:updating AND NOT upgraded_at:* AND upgrade_started_at < now-2h)`; // agents pre 8.12.0 (without upgrade_details) + const newStuckInUpdatingKuery = `(upgrade_details.target_version:${requiredVersion.version} AND upgrade_details.state:UPG_FAILED)`; + const agentsFetcher = await fetchAllAgentsByKuery(esClient, soClient, { + kuery: `policy_id:${agentPolicy.id} AND (NOT upgrade_attempts:*) AND (${statusKuery} OR ${oldStuckInUpdatingKuery} OR ${newStuckInUpdatingKuery})`, perPage: AGENTS_BATCHSIZE, sortField: 'agent.version', sortOrder: 'asc', }); - let { done, agents } = await this.getNextAgentsBatch(activeAgentsFetcher); + let { done, agents } = await this.getNextAgentsBatch(agentsFetcher); + if (agents.length === 0) { + this.logger.debug( + `[AutomaticAgentUpgradeTask] Agent policy ${agentPolicy.id}: no candidate agents found for upgrade (target version: ${requiredVersion.version}, percentage: ${requiredVersion.percentage})` + ); + return; + } let shouldProcessAgents = true; while (shouldProcessAgents) { + this.throwIfAborted(); numberOfAgentsForUpgrade = await this.findAndUpgradeCandidateAgents( esClient, soClient, @@ -300,7 +323,7 @@ export class AutomaticAgentUpgradeTask { agents ); if (!done && numberOfAgentsForUpgrade > 0) { - ({ done, agents } = await this.getNextAgentsBatch(activeAgentsFetcher)); + ({ done, agents } = await this.getNextAgentsBatch(agentsFetcher)); } else { shouldProcessAgents = false; } @@ -313,10 +336,65 @@ export class AutomaticAgentUpgradeTask { } } + private async processRetries( + esClient: ElasticsearchClient, + soClient: SavedObjectsClientContract, + agentPolicy: AgentPolicy, + version: string + ) { + let retriedAgentsCounter = 0; + + const retryingAgentsFetcher = await fetchAllAgentsByKuery(esClient, soClient, { + kuery: `policy_id:${agentPolicy.id} AND upgrade_details.target_version:${version} AND upgrade_details.state:UPG_FAILED AND upgrade_attempts:*`, + perPage: AGENTS_BATCHSIZE, + sortField: 'agent.version', + sortOrder: 'asc', + }); + + for await (const retryingAgentsPageResults of retryingAgentsFetcher) { + this.throwIfAborted(); + // This function will return the total number of agents marked for retry so they're included in the count of agents for upgrade. + retriedAgentsCounter += retryingAgentsPageResults.length; + + const agentsReadyForRetry = retryingAgentsPageResults.filter((agent) => + this.isAgentReadyForRetry(agent, agentPolicy) + ); + if (agentsReadyForRetry.length > 0) { + this.logger.info( + `[AutomaticAgentUpgradeTask] Agent policy ${agentPolicy.id}: retrying upgrade to ${version} for ${agentsReadyForRetry.length} agents` + ); + await sendAutomaticUpgradeAgentsActions(soClient, esClient, { + agents: agentsReadyForRetry, + version, + ...this.getUpgradeDurationSeconds(agentsReadyForRetry.length), + }); + } + } + + return retriedAgentsCounter; + } + + private isAgentReadyForRetry(agent: Agent, agentPolicy: AgentPolicy) { + if (!agent.upgrade_attempts) { + return false; + } + if (agent.upgrade_attempts.length > this.retryDelays.length) { + this.logger.debug( + `[AutomaticAgentUpgradeTask] Agent policy ${agentPolicy.id}: max retry attempts exceeded for agent ${agent.id}` + ); + return false; + } + const currentRetryDelay = moment + .duration('PT' + this.retryDelays[agent.upgrade_attempts.length - 1].toUpperCase()) // https://momentjs.com/docs/#/durations/ + .asMilliseconds(); + const lastUpgradeAttempt = Date.parse(agent.upgrade_attempts[0]); + return Date.now() - lastUpgradeAttempt >= currentRetryDelay; + } + private async getNextAgentsBatch(agentsFetcher: AsyncIterable) { const agentsFetcherIter = agentsFetcher[Symbol.asyncIterator](); const agentsBatch = await agentsFetcherIter.next(); - const agents: Agent[] = agentsBatch.value; + const agents: Agent[] = agentsBatch.value ?? []; return { done: agentsBatch.done, agents: agents.filter((agent): agent is AgentWithDefinedVersion => agent.agent !== undefined), @@ -347,11 +425,10 @@ export class AutomaticAgentUpgradeTask { this.logger.info( `[AutomaticAgentUpgradeTask] Agent policy ${agentPolicy.id}: sending bulk upgrade to ${version} for ${agentsForUpgrade.length} agents` ); - await sendUpgradeAgentsActions(soClient, esClient, { + await sendAutomaticUpgradeAgentsActions(soClient, esClient, { agents: agentsForUpgrade, version, ...this.getUpgradeDurationSeconds(agentsForUpgrade.length), - isAutomatic: true, }); } @@ -359,13 +436,7 @@ export class AutomaticAgentUpgradeTask { } private isAgentEligibleForUpgrade(agent: AgentWithDefinedVersion, version: string) { - return ( - isAgentUpgradeable(agent) && - (agent.status !== 'updating' || - (AgentStatusKueryHelper.isStuckInUpdating(agent) && - agent.upgrade_details?.target_version === version)) && - semverGt(version, agent.agent.version) - ); + return isAgentUpgradeable(agent) && semverGt(version, agent.agent.version); } private getUpgradeDurationSeconds(nAgents: number) { diff --git a/x-pack/platform/plugins/shared/fleet/server/types/rest_spec/agent.ts b/x-pack/platform/plugins/shared/fleet/server/types/rest_spec/agent.ts index 3be93963d0a60..2c9cd31102d5d 100644 --- a/x-pack/platform/plugins/shared/fleet/server/types/rest_spec/agent.ts +++ b/x-pack/platform/plugins/shared/fleet/server/types/rest_spec/agent.ts @@ -161,6 +161,9 @@ export const AgentResponseSchema = schema.object({ }), ]) ), + upgrade_attempts: schema.maybe( + schema.oneOf([schema.literal(null), schema.arrayOf(schema.string())]) + ), access_api_key_id: schema.maybe(schema.string()), default_api_key: schema.maybe(schema.string()), default_api_key_id: schema.maybe(schema.string()),