Skip to content

Commit 85f7fa9

Browse files
authored
ILM: Force merge on zero-replica cloned index before snapshot (elastic#133954)
When performing a searchable snapshot action with force merge enabled, if the source index has one or more replicas, ILM now clones the index with zero replicas and performs the force merge on the clone. The snapshot is then taken from the force-merged clone instead of the source index, ensuring only primary shards are force-merged. The cloned index is deleted after the snapshot is mounted, and all references and step logic have been updated accordingly. Test coverage was added for the new flow, including handling retries and cleanup of failed clones.

Key changes:
- Execution state: Track the force-merged clone index in ILM state and propagate through relevant APIs.
- SearchableSnapshotAction: Add conditional steps to clone the index with 0 replicas, force-merge, and delete the clone as needed.
- Steps: Update ForceMerge, SegmentCount, Snapshot, and Delete steps to operate on the correct index (source or clone).
- Tests/QA: Add and enhance tests to verify force-merge and snapshot behavior with and without replicas, including retry/cleanup paths and configuration for stable force-merges.

Resolves elastic#75478
1 parent 3e6cf2d commit 85f7fa9

File tree

18 files changed

+641
-132
lines changed

18 files changed

+641
-132
lines changed

docs/changelog/133954.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 133954
2+
summary: "ILM: Force merge on zero-replica cloned index before snapshotting for searchable snapshots"
3+
area: ILM+SLM
4+
type: enhancement
5+
issues:
6+
- 75478

server/src/main/java/org/elasticsearch/cluster/metadata/LifecycleExecutionState.java

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ public record LifecycleExecutionState(
4040
String snapshotName,
4141
String shrinkIndexName,
4242
String snapshotIndexName,
43-
String downsampleIndexName
43+
String downsampleIndexName,
44+
String forceMergeCloneIndexName
4445
) {
4546

4647
public static final String ILM_CUSTOM_METADATA_KEY = "ilm";
@@ -64,6 +65,7 @@ public record LifecycleExecutionState(
6465
private static final String SNAPSHOT_INDEX_NAME = "snapshot_index_name";
6566
private static final String SHRINK_INDEX_NAME = "shrink_index_name";
6667
private static final String DOWNSAMPLE_INDEX_NAME = "rollup_index_name";
68+
private static final String FORCE_MERGE_CLONE_INDEX_NAME = "force_merge_clone_index_name";
6769

6870
public static final LifecycleExecutionState EMPTY_STATE = LifecycleExecutionState.builder().build();
6971

@@ -89,7 +91,8 @@ public static Builder builder(LifecycleExecutionState state) {
8991
.setShrinkIndexName(state.shrinkIndexName)
9092
.setSnapshotIndexName(state.snapshotIndexName)
9193
.setDownsampleIndexName(state.downsampleIndexName)
92-
.setStepTime(state.stepTime);
94+
.setStepTime(state.stepTime)
95+
.setForceMergeCloneIndexName(state.forceMergeCloneIndexName);
9396
}
9497

9598
public static LifecycleExecutionState fromCustomMetadata(Map<String, String> customData) {
@@ -202,6 +205,10 @@ public static LifecycleExecutionState fromCustomMetadata(Map<String, String> cus
202205
if (downsampleIndexName != null) {
203206
builder.setDownsampleIndexName(downsampleIndexName);
204207
}
208+
String forceMergeCloneIndexName = customData.get(FORCE_MERGE_CLONE_INDEX_NAME);
209+
if (forceMergeCloneIndexName != null) {
210+
builder.setForceMergeCloneIndexName(forceMergeCloneIndexName);
211+
}
205212
return builder.build();
206213
}
207214

@@ -274,6 +281,9 @@ public Map<String, String> asMap() {
274281
if (downsampleIndexName != null) {
275282
result.put(DOWNSAMPLE_INDEX_NAME, downsampleIndexName);
276283
}
284+
if (forceMergeCloneIndexName != null) {
285+
result.put(FORCE_MERGE_CLONE_INDEX_NAME, forceMergeCloneIndexName);
286+
}
277287
return Collections.unmodifiableMap(result);
278288
}
279289

@@ -307,6 +317,7 @@ public static class Builder {
307317
private String shrinkIndexName;
308318
private String snapshotIndexName;
309319
private String downsampleIndexName;
320+
private String forceMergeCloneIndexName;
310321

311322
public Builder setPhase(String phase) {
312323
this.phase = phase;
@@ -398,6 +409,11 @@ public Builder setDownsampleIndexName(String downsampleIndexName) {
398409
return this;
399410
}
400411

412+
/**
 * Records the name of the index that was cloned for the force merge, so that it is carried into the
 * {@link LifecycleExecutionState} produced by {@link #build()}.
 *
 * @param forceMergeCloneIndexName the clone index name; stored as given (no null check is performed here)
 * @return this builder, to allow call chaining
 */
public Builder setForceMergeCloneIndexName(String forceMergeCloneIndexName) {
413+
this.forceMergeCloneIndexName = forceMergeCloneIndexName;
414+
return this;
415+
}
416+
401417
public LifecycleExecutionState build() {
402418
return new LifecycleExecutionState(
403419
phase,
@@ -417,7 +433,8 @@ public LifecycleExecutionState build() {
417433
snapshotName,
418434
shrinkIndexName,
419435
snapshotIndexName,
420-
downsampleIndexName
436+
downsampleIndexName,
437+
forceMergeCloneIndexName
421438
);
422439
}
423440
}

test/framework/src/main/java/org/elasticsearch/test/rest/ESRestTestCase.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2087,6 +2087,10 @@ protected static void awaitIndexExists(String index, TimeValue timeout) throws I
20872087
ensureHealth(client(), index, request -> request.addParameter("timeout", timeout.toString()));
20882088
}
20892089

2090+
/**
 * Waits for the given index to no longer exist, using a default timeout of 10 seconds.
 * Delegates to {@link #awaitIndexDoesNotExist(String, TimeValue)}.
 *
 * @param index the name of the index that is expected to disappear
 * @throws Exception propagated from the delegated overload
 */
protected static void awaitIndexDoesNotExist(String index) throws Exception {
2091+
awaitIndexDoesNotExist(index, TimeValue.timeValueSeconds(10));
2092+
}
2093+
20902094
/**
 * Asserts, via {@code assertBusy}, that {@code indexExists(index)} is false, passing the given timeout
 * (converted to milliseconds) as the wait bound.
 * NOTE(review): presumably {@code assertBusy} retries the assertion until it passes or the timeout elapses - confirm.
 *
 * @param index   the name of the index that is expected to disappear
 * @param timeout the wait bound handed to {@code assertBusy}
 * @throws Exception propagated from {@code assertBusy}
 */
protected static void awaitIndexDoesNotExist(String index, TimeValue timeout) throws Exception {
20912095
assertBusy(() -> assertFalse(indexExists(index)), timeout.millis(), TimeUnit.MILLISECONDS);
20922096
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/CreateSnapshotStep.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,11 @@ void createSnapshot(ProjectId projectId, IndexMetadata indexMetadata, ActionList
101101
);
102102
return;
103103
}
104+
// If we performed the force merge step on the cloned index, we need to snapshot that index instead of the original.
105+
final String clonedIndexName = lifecycleState.forceMergeCloneIndexName();
106+
final String forceMergedIndexName = clonedIndexName != null ? clonedIndexName : indexName;
104107
CreateSnapshotRequest request = new CreateSnapshotRequest(TimeValue.MAX_VALUE, snapshotRepository, snapshotName);
105-
request.indices(indexName);
108+
request.indices(forceMergedIndexName);
106109
// this is safe as the snapshot creation will still be async, it's just that the listener will be notified when the snapshot is
107110
// complete
108111
request.waitForCompletion(true);
@@ -112,7 +115,7 @@ void createSnapshot(ProjectId projectId, IndexMetadata indexMetadata, ActionList
112115
logger.debug(
113116
"create snapshot response for policy [{}] and index [{}] is: {}",
114117
policyName,
115-
indexName,
118+
forceMergedIndexName,
116119
Strings.toString(response)
117120
);
118121
final SnapshotInfo snapInfo = response.getSnapshotInfo();
@@ -128,7 +131,7 @@ void createSnapshot(ProjectId projectId, IndexMetadata indexMetadata, ActionList
128131
snapshotRepository,
129132
snapshotName,
130133
policyName,
131-
indexName,
134+
forceMergedIndexName,
132135
snapInfo.failedShards(),
133136
snapInfo.totalShards()
134137
)

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DeleteStep.java

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,56 @@
1616
import org.elasticsearch.cluster.metadata.DataStream;
1717
import org.elasticsearch.cluster.metadata.IndexAbstraction;
1818
import org.elasticsearch.cluster.metadata.IndexMetadata;
19+
import org.elasticsearch.cluster.metadata.LifecycleExecutionState;
1920
import org.elasticsearch.cluster.metadata.ProjectMetadata;
2021
import org.elasticsearch.common.Strings;
2122
import org.elasticsearch.core.TimeValue;
2223
import org.elasticsearch.index.Index;
2324

25+
import java.util.function.BiFunction;
26+
2427
/**
2528
* Deletes a single index.
2629
*/
2730
public class DeleteStep extends AsyncRetryDuringSnapshotActionStep {
31+
2832
public static final String NAME = "delete";
2933
private static final Logger logger = LogManager.getLogger(DeleteStep.class);
34+
private static final BiFunction<String, LifecycleExecutionState, String> DEFAULT_TARGET_INDEX_NAME_SUPPLIER = (
35+
indexName,
36+
lifecycleState) -> indexName;
37+
38+
private final BiFunction<String, LifecycleExecutionState, String> targetIndexNameSupplier;
39+
private final boolean indexSurvives;
3040

41+
/**
42+
* Use this constructor to delete the index that ILM is currently operating on.
43+
*/
3144
public DeleteStep(StepKey key, StepKey nextStepKey, Client client) {
45+
this(key, nextStepKey, client, DEFAULT_TARGET_INDEX_NAME_SUPPLIER, false);
46+
}
47+
48+
/**
49+
* Use this constructor to delete a specific index, potentially different from the one that ILM is currently operating on. The parameter
50+
* {@code indexSurvives} indicates whether the index that ILM runs on will survive (i.e. not get deleted) this step.
51+
* Look at the callers of {@link AsyncActionStep#indexSurvives()} for more details.
52+
*/
53+
public DeleteStep(
54+
StepKey key,
55+
StepKey nextStepKey,
56+
Client client,
57+
BiFunction<String, LifecycleExecutionState, String> targetIndexNameSupplier,
58+
boolean indexSurvives
59+
) {
3260
super(key, nextStepKey, client);
61+
this.targetIndexNameSupplier = targetIndexNameSupplier;
62+
this.indexSurvives = indexSurvives;
3363
}
3464

3565
@Override
3666
public void performDuringNoSnapshot(IndexMetadata indexMetadata, ProjectMetadata currentProject, ActionListener<Void> listener) {
3767
String policyName = indexMetadata.getLifecyclePolicyName();
38-
String indexName = indexMetadata.getIndex().getName();
68+
String indexName = targetIndexNameSupplier.apply(indexMetadata.getIndex().getName(), indexMetadata.getLifecycleExecutionState());
3969
IndexAbstraction indexAbstraction = currentProject.getIndicesLookup().get(indexName);
4070
assert indexAbstraction != null : "invalid cluster metadata. index [" + indexName + "] was not found";
4171
DataStream dataStream = indexAbstraction.getParentDataStream();
@@ -88,7 +118,7 @@ public void performDuringNoSnapshot(IndexMetadata indexMetadata, ProjectMetadata
88118

89119
@Override
90120
public boolean indexSurvives() {
91-
return false;
121+
return indexSurvives;
92122
}
93123

94124
@Override

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/ForceMergeStep.java

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@
1616
import org.elasticsearch.cluster.ClusterStateObserver;
1717
import org.elasticsearch.cluster.ProjectState;
1818
import org.elasticsearch.cluster.metadata.IndexMetadata;
19+
import org.elasticsearch.cluster.metadata.LifecycleExecutionState;
1920
import org.elasticsearch.common.Strings;
2021

2122
import java.util.Arrays;
2223
import java.util.Objects;
24+
import java.util.function.BiFunction;
2325

2426
/**
2527
* Invokes a force merge on a single index.
@@ -28,11 +30,33 @@ public class ForceMergeStep extends AsyncActionStep {
2830

2931
public static final String NAME = "forcemerge";
3032
private static final Logger logger = LogManager.getLogger(ForceMergeStep.class);
33+
private static final BiFunction<String, LifecycleExecutionState, String> DEFAULT_TARGET_INDEX_NAME_SUPPLIER = (
34+
indexName,
35+
lifecycleState) -> indexName;
36+
3137
private final int maxNumSegments;
38+
private final BiFunction<String, LifecycleExecutionState, String> targetIndexNameSupplier;
3239

40+
/**
41+
* Creates a new {@link ForceMergeStep} that will perform a force merge on the index that ILM is currently operating on.
42+
*/
3343
public ForceMergeStep(StepKey key, StepKey nextStepKey, Client client, int maxNumSegments) {
44+
this(key, nextStepKey, client, maxNumSegments, DEFAULT_TARGET_INDEX_NAME_SUPPLIER);
45+
}
46+
47+
/**
48+
* Creates a new {@link ForceMergeStep} that will perform a force merge on the index name returned by the supplier.
49+
*/
50+
public ForceMergeStep(
51+
StepKey key,
52+
StepKey nextStepKey,
53+
Client client,
54+
int maxNumSegments,
55+
BiFunction<String, LifecycleExecutionState, String> targetIndexNameSupplier
56+
) {
3457
super(key, nextStepKey, client);
3558
this.maxNumSegments = maxNumSegments;
59+
this.targetIndexNameSupplier = targetIndexNameSupplier;
3660
}
3761

3862
@Override
@@ -51,7 +75,8 @@ public void performAction(
5175
ClusterStateObserver observer,
5276
ActionListener<Void> listener
5377
) {
54-
String indexName = indexMetadata.getIndex().getName();
78+
String indexName = targetIndexNameSupplier.apply(indexMetadata.getIndex().getName(), indexMetadata.getLifecycleExecutionState());
79+
assert indexName != null : "target index name supplier must not return null";
5580
ForceMergeRequest request = new ForceMergeRequest(indexName);
5681
request.maxNumSegments(maxNumSegments);
5782
getClient(currentState.projectId()).admin().indices().forceMerge(request, listener.delegateFailureAndWrap((l, response) -> {

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/GenerateSnapshotNameStep.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import java.util.Locale;
2323
import java.util.Objects;
24+
import java.util.function.BiFunction;
2425

2526
/**
2627
* Generates a snapshot name for the given index and records it in the index metadata along with the provided snapshot repository.
@@ -35,10 +36,17 @@ public class GenerateSnapshotNameStep extends ClusterStateActionStep {
3536
private static final Logger logger = LogManager.getLogger(GenerateSnapshotNameStep.class);
3637

3738
private final String snapshotRepository;
38-
39-
public GenerateSnapshotNameStep(StepKey key, StepKey nextStepKey, String snapshotRepository) {
39+
private final BiFunction<String, LifecycleExecutionState, String> targetIndexNameSupplier;
40+
41+
public GenerateSnapshotNameStep(
42+
StepKey key,
43+
StepKey nextStepKey,
44+
String snapshotRepository,
45+
BiFunction<String, LifecycleExecutionState, String> targetIndexNameSupplier
46+
) {
4047
super(key, nextStepKey);
4148
this.snapshotRepository = snapshotRepository;
49+
this.targetIndexNameSupplier = targetIndexNameSupplier;
4250
}
4351

4452
public String getSnapshotRepository() {
@@ -72,9 +80,11 @@ public ProjectState performAction(Index index, ProjectState projectState) {
7280
+ "] cannot continue until the repository is created or the policy is changed"
7381
);
7482
}
83+
final String indexName = targetIndexNameSupplier.apply(index.getName(), lifecycleState);
84+
assert indexName != null : "target index name supplier must not return null";
7585

7686
LifecycleExecutionState.Builder newLifecycleState = LifecycleExecutionState.builder(lifecycleState);
77-
newLifecycleState.setSnapshotIndexName(index.getName());
87+
newLifecycleState.setSnapshotIndexName(indexName);
7888
newLifecycleState.setSnapshotRepository(snapshotRepository);
7989
if (lifecycleState.snapshotName() == null) {
8090
// generate and validate the snapshotName

0 commit comments

Comments
 (0)