Skip to content

Commit 348ae36

Browse files
kibanamachine, nkhristinin, elasticmachine
authored
[8.18] Fix that gap can be stuck "in-progress" (elastic#221473) (elastic#224176)
# Backport This will backport the following commits from `main` to `8.18`: - Fix that gap can be stuck "in-progress" (elastic#221473) (dfd783e) <!--- Backport version: 9.6.6 --> ### Questions ? Please refer to the [Backport tool documentation](https://github.com/sorenlouv/backport) <!--BACKPORT [{"author":{"name":"Khristinin Nikita","email":"[email protected]"},"sourceCommit":{"committedDate":"2025-06-17T06:47:01Z","message":"Fix that gap can be stuck \"in-progress\" (elastic#221473)\n\n## Summary\n\n\n[[Issue](https://github.com/elastic/kibana/issues/221111)](https://github.com/elastic/kibana/issues/221111)\n\nGaps can get stuck in the `in-progress` state if a rule is\nbackfill-executed with failures.\n\n### Current behavior:\n\nLet's say we have a gap from `12:00–13:00`.\n\nWhen the gap is initially detected, it has the following state:\n\n```\nfilled_intervals: []\nunfilled_intervals: [12:00–13:00]\nin_progress_intervals: []\n```\n\nWhen a backfill starts, we set `in_progress_intervals` to the range that\noverlaps with the backfill. We also remove that range from\n`unfilled_intervals`:\n\n```\nfilled_intervals: []\nunfilled_intervals: []\nin_progress_intervals: [12:00–13:00]\n```\n\nAfter the backfill is successfully executed, we move the range to\n`filled_intervals` and clear `in_progress_intervals`:\n\n```\nfilled_intervals: [12:00–13:00]\nunfilled_intervals: []\nin_progress_intervals: []\n```\n\nHowever, if the backfill fails, we want to remove the range from\n`in_progress_intervals` and move it back to `unfilled_intervals`. The\nproblem is that we cannot simply do this because there might be other\noverlapping backfills still in progress for the same gap. 
In the case of\na successful execution, this isn’t an issue, as the range is moved to\n`filled_intervals`.\n\nWhen a backfill fails, we refetch all overlapping backfills for the gap\nto recalculate the `in_progress_intervals`.\n\n### Problem\n\nIn the current implementation, we're updating the gaps **before**\ndeleting the failed backfill. This causes the recalculated\n`in_progress_intervals` to still include the failed backfill’s range,\nresulting in a stale state.\n\n### Fix\n\nWe should **first delete** the failed backfill, and **then** update the\ngap. This ensures that the recalculated `in_progress_intervals` reflect\nonly the remaining active backfills.\n\n---------\n\nCo-authored-by: Elastic Machine <[email protected]>","sha":"dfd783e12a4046758be75c05bbe36bc105710296"},"sourceBranch":"main","suggestedTargetBranches":[],"targetPullRequestStates":[]}] BACKPORT--> Co-authored-by: Khristinin Nikita <[email protected]> Co-authored-by: Elastic Machine <[email protected]>
1 parent 3ff5b72 commit 348ae36

File tree

2 files changed

+15
-10
lines changed

2 files changed

+15
-10
lines changed

x-pack/platform/plugins/shared/alerting/server/task_runner/ad_hoc_task_runner.test.ts

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -888,9 +888,14 @@ describe('Ad Hoc Task Runner', () => {
888888
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
889889
AD_HOC_RUN_SAVED_OBJECT_TYPE,
890890
'abc',
891-
{ refresh: false, namespace: undefined }
891+
{ refresh: true, namespace: undefined }
892892
);
893893

894+
// Verify that updateGaps was called after delete
895+
const deleteCallOrder = internalSavedObjectsRepository.delete.mock.invocationCallOrder[0];
896+
const updateGapsCallOrder = mockUpdateGaps.mock.invocationCallOrder[0];
897+
expect(updateGapsCallOrder).toBeGreaterThan(deleteCallOrder);
898+
894899
expect(mockUpdateGaps).toHaveBeenCalledWith({
895900
ruleId: RULE_ID,
896901
start: new Date(mockedAdHocRunSO.attributes.start),
@@ -969,7 +974,7 @@ describe('Ad Hoc Task Runner', () => {
969974
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
970975
AD_HOC_RUN_SAVED_OBJECT_TYPE,
971976
'abc',
972-
{ refresh: false, namespace: undefined }
977+
{ refresh: true, namespace: undefined }
973978
);
974979

975980
testAlertingEventLogCalls({
@@ -1031,7 +1036,7 @@ describe('Ad Hoc Task Runner', () => {
10311036
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
10321037
AD_HOC_RUN_SAVED_OBJECT_TYPE,
10331038
'abc',
1034-
{ refresh: false, namespace: undefined }
1039+
{ refresh: true, namespace: undefined }
10351040
);
10361041

10371042
testAlertingEventLogCalls({
@@ -1093,7 +1098,7 @@ describe('Ad Hoc Task Runner', () => {
10931098
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
10941099
AD_HOC_RUN_SAVED_OBJECT_TYPE,
10951100
'abc',
1096-
{ refresh: false, namespace: undefined }
1101+
{ refresh: true, namespace: undefined }
10971102
);
10981103

10991104
testAlertingEventLogCalls({
@@ -1157,7 +1162,7 @@ describe('Ad Hoc Task Runner', () => {
11571162
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
11581163
AD_HOC_RUN_SAVED_OBJECT_TYPE,
11591164
'abc',
1160-
{ refresh: false, namespace: undefined }
1165+
{ refresh: true, namespace: undefined }
11611166
);
11621167

11631168
testAlertingEventLogCalls({
@@ -1314,7 +1319,7 @@ describe('Ad Hoc Task Runner', () => {
13141319
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
13151320
AD_HOC_RUN_SAVED_OBJECT_TYPE,
13161321
'abc',
1317-
{ refresh: false, namespace: undefined }
1322+
{ refresh: true, namespace: undefined }
13181323
);
13191324

13201325
testAlertingEventLogCalls({
@@ -1463,7 +1468,7 @@ describe('Ad Hoc Task Runner', () => {
14631468
expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
14641469
AD_HOC_RUN_SAVED_OBJECT_TYPE,
14651470
mockedAdHocRunSO.id,
1466-
{ namespace: undefined, refresh: false }
1471+
{ namespace: undefined, refresh: true }
14671472
);
14681473

14691474
testAlertingEventLogCalls({

x-pack/platform/plugins/shared/alerting/server/task_runner/ad_hoc_task_runner.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -620,17 +620,17 @@ export class AdHocTaskRunner implements CancellableTask {
620620
async cleanup() {
621621
if (!this.shouldDeleteTask) return;
622622

623-
await this.updateGapsAfterBackfillComplete();
624-
625623
try {
626624
await this.internalSavedObjectsRepository.delete(
627625
AD_HOC_RUN_SAVED_OBJECT_TYPE,
628626
this.taskInstance.params.adHocRunParamsId,
629627
{
630-
refresh: false,
628+
refresh: true,
631629
namespace: this.context.spaceIdToNamespace(this.taskInstance.params.spaceId),
632630
}
633631
);
632+
633+
await this.updateGapsAfterBackfillComplete();
634634
} catch (e) {
635635
// Log error only, we shouldn't fail the task because of an error here (if ever there's retry logic)
636636
this.logger.error(

0 commit comments

Comments
 (0)