Skip to content

Commit e9cfd81

Browse files
authored
Retry downsample ILM action using a new target index (#94965)
Currently, when the ILM downample step is being retried, the same target index is used. This can cause the subsequent downsample API invocation to index rolled up data into shards of the target index that already exists and while the previous downsample API invocation is still partially running (and also rolling up data into the same target shard). Note that, the downsample step may fail in case a cluster is being restarted in a rolling manner (for example for an upgrade) or when the elected master node fails (the downsample action is coordinated from the elected master node). This PR modfies the ILM DownsampleAction so that when DownsampleStep fails, it will retry by going performing the following steps 1. Cleanup existing target index, 2. Generate a new index name for the target index 3. Downsample using the new target index name. Note 1: This change may leave some garbage indices that we must find another way to cleanup. However, the downsample process will become more resilient. Note 2: A similar approach is used by the searchable_snapshot ILM action Closes #93580
1 parent 7370d1b commit e9cfd81

File tree

7 files changed

+227
-123
lines changed

7 files changed

+227
-123
lines changed

docs/changelog/94965.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 94965
2+
summary: Retry downsample ILM action using a new target index
3+
area: "ILM+SLM"
4+
type: bug
5+
issues:
6+
- 93580

server/src/main/java/org/elasticsearch/cluster/metadata/LifecycleExecutionState.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public static Builder builder(LifecycleExecutionState state) {
8181
.setSnapshotName(state.snapshotName)
8282
.setShrinkIndexName(state.shrinkIndexName)
8383
.setSnapshotIndexName(state.snapshotIndexName)
84-
.setRollupIndexName(state.downsampleIndexName)
84+
.setDownsampleIndexName(state.downsampleIndexName)
8585
.setStepTime(state.stepTime);
8686
}
8787

@@ -187,9 +187,9 @@ public static LifecycleExecutionState fromCustomMetadata(Map<String, String> cus
187187
if (snapshotIndexName != null) {
188188
builder.setSnapshotIndexName(snapshotIndexName);
189189
}
190-
String rollupIndexName = customData.get(DOWNSAMPLE_INDEX_NAME);
191-
if (rollupIndexName != null) {
192-
builder.setRollupIndexName(rollupIndexName);
190+
String downsampleIndexName = customData.get(DOWNSAMPLE_INDEX_NAME);
191+
if (downsampleIndexName != null) {
192+
builder.setDownsampleIndexName(downsampleIndexName);
193193
}
194194
return builder.build();
195195
}
@@ -273,7 +273,7 @@ public static class Builder {
273273
private String snapshotRepository;
274274
private String shrinkIndexName;
275275
private String snapshotIndexName;
276-
private String rollupIndexName;
276+
private String downsampleIndexName;
277277

278278
public Builder setPhase(String phase) {
279279
this.phase = phase;
@@ -355,8 +355,8 @@ public Builder setSnapshotIndexName(String snapshotIndexName) {
355355
return this;
356356
}
357357

358-
public Builder setRollupIndexName(String rollupIndexName) {
359-
this.rollupIndexName = rollupIndexName;
358+
public Builder setDownsampleIndexName(String downsampleIndexName) {
359+
this.downsampleIndexName = downsampleIndexName;
360360
return this;
361361
}
362362

@@ -378,7 +378,7 @@ public LifecycleExecutionState build() {
378378
snapshotName,
379379
shrinkIndexName,
380380
snapshotIndexName,
381-
rollupIndexName
381+
downsampleIndexName
382382
);
383383
}
384384
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DownsampleAction.java

Lines changed: 37 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,10 @@ public List<Step> toSteps(Client client, String phase, StepKey nextStepKey) {
106106
StepKey checkNotWriteIndex = new StepKey(phase, NAME, CheckNotDataStreamWriteIndexStep.NAME);
107107
StepKey waitForNoFollowerStepKey = new StepKey(phase, NAME, WaitForNoFollowersStep.NAME);
108108
StepKey readOnlyKey = new StepKey(phase, NAME, ReadOnlyStep.NAME);
109-
StepKey cleanupRollupIndexKey = new StepKey(phase, NAME, CleanupTargetIndexStep.NAME);
110-
StepKey generateRollupIndexNameKey = new StepKey(phase, NAME, GENERATE_DOWNSAMPLE_STEP_NAME);
111-
StepKey rollupKey = new StepKey(phase, NAME, DownsampleStep.NAME);
112-
StepKey waitForRollupIndexKey = new StepKey(phase, NAME, WaitForIndexColorStep.NAME);
109+
StepKey cleanupDownsampleIndexKey = new StepKey(phase, NAME, CleanupTargetIndexStep.NAME);
110+
StepKey generateDownsampleIndexNameKey = new StepKey(phase, NAME, GENERATE_DOWNSAMPLE_STEP_NAME);
111+
StepKey downsampleKey = new StepKey(phase, NAME, DownsampleStep.NAME);
112+
StepKey waitForDownsampleIndexKey = new StepKey(phase, NAME, WaitForIndexColorStep.NAME);
113113
StepKey copyMetadataKey = new StepKey(phase, NAME, CopyExecutionStateStep.NAME);
114114
StepKey dataStreamCheckBranchingKey = new StepKey(phase, NAME, CONDITIONAL_DATASTREAM_CHECK_KEY);
115115
StepKey replaceDataStreamIndexKey = new StepKey(phase, NAME, ReplaceDataStreamBackingIndexStep.NAME);
@@ -132,44 +132,51 @@ public List<Step> toSteps(Client client, String phase, StepKey nextStepKey) {
132132
checkNotWriteIndex,
133133
waitForNoFollowerStepKey
134134
);
135-
WaitForNoFollowersStep waitForNoFollowersStep = new WaitForNoFollowersStep(waitForNoFollowerStepKey, cleanupRollupIndexKey, client);
136-
137-
// We generate a unique rollup index name, but we also retry if the allocation of the rollup index is not possible, so we want to
138-
// delete the "previously generated" rollup index (this is a no-op if it's the first run of the action, and we haven't generated a
139-
// rollup index name)
140-
CleanupTargetIndexStep cleanupRollupIndexStep = new CleanupTargetIndexStep(
141-
cleanupRollupIndexKey,
142-
readOnlyKey,
135+
WaitForNoFollowersStep waitForNoFollowersStep = new WaitForNoFollowersStep(waitForNoFollowerStepKey, readOnlyKey, client);
136+
137+
// Mark source index as read-only
138+
ReadOnlyStep readOnlyStep = new ReadOnlyStep(readOnlyKey, cleanupDownsampleIndexKey, client);
139+
140+
// We generate a unique downsample index name, but we also retry if the allocation of the downsample index
141+
// is not possible, so we want to delete the "previously generated" downsample index (this is a no-op if it's
142+
// the first run of the action, and we haven't generated a downsample index name)
143+
CleanupTargetIndexStep cleanupDownsampleIndexStep = new CleanupTargetIndexStep(
144+
cleanupDownsampleIndexKey,
145+
generateDownsampleIndexNameKey,
143146
client,
144147
(indexMetadata) -> IndexMetadata.INDEX_DOWNSAMPLE_SOURCE_NAME.get(indexMetadata.getSettings()),
145148
(indexMetadata) -> indexMetadata.getLifecycleExecutionState().downsampleIndexName()
146149
);
147-
// Mark source index as read-only
148-
ReadOnlyStep readOnlyStep = new ReadOnlyStep(readOnlyKey, generateRollupIndexNameKey, client);
149150

150-
// Generate a unique rollup index name and store it in the ILM execution state
151-
GenerateUniqueIndexNameStep generateRollupIndexNameStep = new GenerateUniqueIndexNameStep(
152-
generateRollupIndexNameKey,
153-
rollupKey,
151+
// Generate a unique downsample index name and store it in the ILM execution state
152+
GenerateUniqueIndexNameStep generateDownsampleIndexNameStep = new GenerateUniqueIndexNameStep(
153+
generateDownsampleIndexNameKey,
154+
downsampleKey,
154155
DOWNSAMPLED_INDEX_PREFIX,
155-
(rollupIndexName, lifecycleStateBuilder) -> lifecycleStateBuilder.setRollupIndexName(rollupIndexName)
156+
(downsampleIndexName, lifecycleStateBuilder) -> lifecycleStateBuilder.setDownsampleIndexName(downsampleIndexName)
156157
);
157158

158-
// Here is where the actual rollup action takes place
159-
DownsampleStep rollupStep = new DownsampleStep(rollupKey, waitForRollupIndexKey, client, fixedInterval);
159+
// Here is where the actual downsample action takes place
160+
DownsampleStep downsampleStep = new DownsampleStep(
161+
downsampleKey,
162+
waitForDownsampleIndexKey,
163+
cleanupDownsampleIndexKey,
164+
client,
165+
fixedInterval
166+
);
160167

161168
// Wait until the downsampled index is recovered. We again wait until the configured threshold is breached and
162-
// if the downsampled index has not successfully recovered until then, we rewind to the "cleanup-rollup-index"
163-
// step to delete this unsuccessful downsampled index and retry the operation by generating a new downsampled index
169+
// if the downsampled index has not successfully recovered until then, we rewind to the "cleanup-downsample-index"
170+
// step to delete this unsuccessful downsampled index and retry the operation by generating a new downsample index
164171
// name and attempting to downsample again.
165-
ClusterStateWaitUntilThresholdStep rollupAllocatedStep = new ClusterStateWaitUntilThresholdStep(
172+
ClusterStateWaitUntilThresholdStep downsampleAllocatedStep = new ClusterStateWaitUntilThresholdStep(
166173
new WaitForIndexColorStep(
167-
waitForRollupIndexKey,
174+
waitForDownsampleIndexKey,
168175
copyMetadataKey,
169176
ClusterHealthStatus.YELLOW,
170177
(indexName, lifecycleState) -> lifecycleState.downsampleIndexName()
171178
),
172-
cleanupRollupIndexKey
179+
cleanupDownsampleIndexKey
173180
);
174181

175182
CopyExecutionStateStep copyExecutionStateStep = new CopyExecutionStateStep(
@@ -213,11 +220,11 @@ public List<Step> toSteps(Client client, String phase, StepKey nextStepKey) {
213220
isTimeSeriesIndexBranchingStep,
214221
checkNotWriteIndexStep,
215222
waitForNoFollowersStep,
216-
cleanupRollupIndexStep,
217223
readOnlyStep,
218-
generateRollupIndexNameStep,
219-
rollupStep,
220-
rollupAllocatedStep,
224+
cleanupDownsampleIndexStep,
225+
generateDownsampleIndexNameStep,
226+
downsampleStep,
227+
downsampleAllocatedStep,
221228
copyExecutionStateStep,
222229
isDataStreamBranchingStep,
223230
replaceDataStreamBackingIndex,

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DownsampleStep.java

Lines changed: 61 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import org.apache.logging.log4j.LogManager;
1010
import org.apache.logging.log4j.Logger;
1111
import org.elasticsearch.action.ActionListener;
12-
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
1312
import org.elasticsearch.client.internal.Client;
1413
import org.elasticsearch.cluster.ClusterState;
1514
import org.elasticsearch.cluster.ClusterStateObserver;
@@ -35,9 +34,20 @@ public class DownsampleStep extends AsyncActionStep {
3534
private static final Logger logger = LogManager.getLogger(DownsampleStep.class);
3635

3736
private final DateHistogramInterval fixedInterval;
37+
private final StepKey nextStepOnSuccess;
38+
private final StepKey nextStepOnFailure;
39+
private volatile boolean downsampleFailed;
3840

39-
public DownsampleStep(StepKey key, StepKey nextStepKey, Client client, DateHistogramInterval fixedInterval) {
40-
super(key, nextStepKey, client);
41+
public DownsampleStep(
42+
StepKey key,
43+
StepKey nextStepOnSuccess,
44+
StepKey nextStepOnFailure,
45+
Client client,
46+
DateHistogramInterval fixedInterval
47+
) {
48+
super(key, null, client);
49+
this.nextStepOnSuccess = nextStepOnSuccess;
50+
this.nextStepOnFailure = nextStepOnFailure;
4151
this.fixedInterval = fixedInterval;
4252
}
4353

@@ -62,72 +72,63 @@ public void performAction(
6272
final String indexName = indexMetadata.getIndex().getName();
6373
final String downsampleIndexName = lifecycleState.downsampleIndexName();
6474
if (Strings.hasText(downsampleIndexName) == false) {
75+
downsampleFailed = true;
6576
listener.onFailure(
6677
new IllegalStateException(
67-
"rollup index name was not generated for policy [" + policyName + "] and index [" + indexName + "]"
78+
"downsample index name was not generated for policy [" + policyName + "] and index [" + indexName + "]"
6879
)
6980
);
7081
return;
7182
}
7283

73-
IndexMetadata rollupIndexMetadata = currentState.metadata().index(downsampleIndexName);
74-
if (rollupIndexMetadata != null) {
75-
IndexMetadata.DownsampleTaskStatus rollupIndexStatus = IndexMetadata.INDEX_DOWNSAMPLE_STATUS.get(
76-
rollupIndexMetadata.getSettings()
84+
IndexMetadata downsampleIndexMetadata = currentState.metadata().index(downsampleIndexName);
85+
if (downsampleIndexMetadata != null) {
86+
IndexMetadata.DownsampleTaskStatus downsampleIndexStatus = IndexMetadata.INDEX_DOWNSAMPLE_STATUS.get(
87+
downsampleIndexMetadata.getSettings()
7788
);
78-
// Rollup index has already been created with the generated name and its status is "success".
79-
// So we skip index rollup creation.
80-
if (IndexMetadata.DownsampleTaskStatus.SUCCESS.equals(rollupIndexStatus)) {
89+
if (IndexMetadata.DownsampleTaskStatus.SUCCESS.equals(downsampleIndexStatus)) {
90+
// Downsample index has already been created with the generated name and its status is "success".
91+
// So we skip index downsample creation.
8192
logger.warn(
82-
"skipping [{}] step for index [{}] as part of policy [{}] as the rollup index [{}] already exists",
93+
"skipping [{}] step for index [{}] as part of policy [{}] as the downsample index [{}] already exists",
8394
DownsampleStep.NAME,
8495
indexName,
8596
policyName,
8697
downsampleIndexName
8798
);
8899
listener.onResponse(null);
89100
} else {
90-
logger.warn(
91-
"[{}] step for index [{}] as part of policy [{}] found the rollup index [{}] already exists. Deleting it.",
92-
DownsampleStep.NAME,
93-
indexName,
94-
policyName,
95-
downsampleIndexName
101+
// Downsample index has already been created with the generated name but its status is not "success".
102+
// So we fail this step so that we go back to cleaning up the index and try again with a new downsample
103+
// index name.
104+
downsampleFailed = true;
105+
listener.onFailure(
106+
new IllegalStateException(
107+
"failing ["
108+
+ DownsampleStep.NAME
109+
+ "] step for index ["
110+
+ indexName
111+
+ "] as part of policy ["
112+
+ policyName
113+
+ "] because the downsample index ["
114+
+ downsampleIndexName
115+
+ "] already exists with downsample status ["
116+
+ downsampleIndexStatus
117+
+ "]"
118+
)
96119
);
97-
// Rollup index has already been created with the generated name but its status is not "success".
98-
// So we delete the index and proceed with executing the rollup step.
99-
DeleteIndexRequest deleteRequest = new DeleteIndexRequest(downsampleIndexName);
100-
getClient().admin().indices().delete(deleteRequest, ActionListener.wrap(response -> {
101-
if (response.isAcknowledged()) {
102-
performDownsampleIndex(indexName, downsampleIndexName, listener);
103-
} else {
104-
listener.onFailure(
105-
new IllegalStateException(
106-
"failing ["
107-
+ DownsampleStep.NAME
108-
+ "] step for index ["
109-
+ indexName
110-
+ "] as part of policy ["
111-
+ policyName
112-
+ "] because the rollup index ["
113-
+ downsampleIndexName
114-
+ "] already exists with rollup status ["
115-
+ rollupIndexStatus
116-
+ "]"
117-
)
118-
);
119-
}
120-
}, listener::onFailure));
121120
}
122-
return;
121+
} else {
122+
performDownsampleIndex(indexName, downsampleIndexName, ActionListener.wrap(listener::onResponse, e -> {
123+
downsampleFailed = true;
124+
listener.onFailure(e);
125+
}));
123126
}
124-
125-
performDownsampleIndex(indexName, downsampleIndexName, listener);
126127
}
127128

128-
private void performDownsampleIndex(String indexName, String rollupIndexName, ActionListener<Void> listener) {
129+
void performDownsampleIndex(String indexName, String downsampleIndexName, ActionListener<Void> listener) {
129130
DownsampleConfig config = new DownsampleConfig(fixedInterval);
130-
DownsampleAction.Request request = new DownsampleAction.Request(indexName, rollupIndexName, config).masterNodeTimeout(
131+
DownsampleAction.Request request = new DownsampleAction.Request(indexName, downsampleIndexName, config).masterNodeTimeout(
131132
TimeValue.MAX_VALUE
132133
);
133134
// Currently, DownsampleAction always acknowledges action was complete when no exceptions are thrown.
@@ -138,24 +139,35 @@ private void performDownsampleIndex(String indexName, String rollupIndexName, Ac
138139
);
139140
}
140141

142+
@Override
143+
public final StepKey getNextStepKey() {
144+
return downsampleFailed ? nextStepOnFailure : nextStepOnSuccess;
145+
}
146+
141147
public DateHistogramInterval getFixedInterval() {
142148
return fixedInterval;
143149
}
144150

145151
@Override
146152
public int hashCode() {
147-
return Objects.hash(super.hashCode(), fixedInterval);
153+
return Objects.hash(super.hashCode(), fixedInterval, nextStepOnSuccess, nextStepOnFailure);
148154
}
149155

150156
@Override
151157
public boolean equals(Object obj) {
158+
if (this == obj) {
159+
return true;
160+
}
152161
if (obj == null) {
153162
return false;
154163
}
155164
if (getClass() != obj.getClass()) {
156165
return false;
157166
}
158167
DownsampleStep other = (DownsampleStep) obj;
159-
return super.equals(obj) && Objects.equals(fixedInterval, other.fixedInterval);
168+
return super.equals(obj)
169+
&& Objects.equals(fixedInterval, other.fixedInterval)
170+
&& Objects.equals(nextStepOnSuccess, other.nextStepOnSuccess)
171+
&& Objects.equals(nextStepOnFailure, other.nextStepOnFailure);
160172
}
161173
}

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DownsampleActionTests.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,14 @@ public void testToSteps() {
7676

7777
assertTrue(steps.get(2) instanceof WaitForNoFollowersStep);
7878
assertThat(steps.get(2).getKey().name(), equalTo(WaitForNoFollowersStep.NAME));
79-
assertThat(steps.get(2).getNextStepKey().name(), equalTo(CleanupTargetIndexStep.NAME));
79+
assertThat(steps.get(2).getNextStepKey().name(), equalTo(ReadOnlyStep.NAME));
8080

81-
assertTrue(steps.get(3) instanceof CleanupTargetIndexStep);
82-
assertThat(steps.get(3).getKey().name(), equalTo(CleanupTargetIndexStep.NAME));
83-
assertThat(steps.get(3).getNextStepKey().name(), equalTo(ReadOnlyStep.NAME));
81+
assertTrue(steps.get(3) instanceof ReadOnlyStep);
82+
assertThat(steps.get(3).getKey().name(), equalTo(ReadOnlyStep.NAME));
83+
assertThat(steps.get(3).getNextStepKey().name(), equalTo(CleanupTargetIndexStep.NAME));
8484

85-
assertTrue(steps.get(4) instanceof ReadOnlyStep);
86-
assertThat(steps.get(4).getKey().name(), equalTo(ReadOnlyStep.NAME));
85+
assertTrue(steps.get(4) instanceof CleanupTargetIndexStep);
86+
assertThat(steps.get(4).getKey().name(), equalTo(CleanupTargetIndexStep.NAME));
8787
assertThat(steps.get(4).getNextStepKey().name(), equalTo(GENERATE_DOWNSAMPLE_STEP_NAME));
8888

8989
assertTrue(steps.get(5) instanceof GenerateUniqueIndexNameStep);

0 commit comments

Comments
 (0)