Skip to content

Commit eacda49

Browse files
committed
WIP
1 parent 62c84a4 commit eacda49

File tree

3 files changed

+285
-4
lines changed

3 files changed

+285
-4
lines changed

x-pack/plugin/slm/src/internalClusterTest/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
import java.util.concurrent.atomic.AtomicReference;
5858

5959
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
60+
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
6061
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertResponse;
6162
import static org.elasticsearch.xpack.slm.history.SnapshotHistoryStore.SLM_HISTORY_DATA_STREAM;
6263
import static org.hamcrest.Matchers.anyOf;
@@ -72,6 +73,7 @@ public class SLMSnapshotBlockingIntegTests extends AbstractSnapshotIntegTestCase
7273
private static final String NEVER_EXECUTE_CRON_SCHEDULE = "* * * 31 FEB ? *";
7374

7475
static final String REPO = "my-repo";
76+
List<String> masterNodeNames = null;
7577
List<String> dataNodeNames = null;
7678

7779
@Override
@@ -85,7 +87,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
8587
@Before
8688
public void ensureClusterNodes() {
8789
logger.info("--> starting enough nodes to ensure we have enough to safely stop for tests");
88-
internalCluster().startMasterOnlyNodes(2);
90+
masterNodeNames = internalCluster().startMasterOnlyNodes(2);
8991
dataNodeNames = internalCluster().startDataOnlyNodes(2);
9092
ensureGreen();
9193
}
@@ -329,6 +331,140 @@ public void testRetentionWithMultipleRepositories() throws Exception {
329331
testUnsuccessfulSnapshotRetention(randomBoolean());
330332
}
331333

334+
// Test that stats and lastsuccess is not updated when master failover during a snapshot, even if the snapshot is successful.
335+
public void testSLMWithMasterFailover() throws Exception {
336+
/**
337+
* set up:
338+
* create repo
339+
* create policy
340+
* create index
341+
*
342+
* execute:
343+
* block master from finalizing snapshot
344+
* execute policy
345+
* fail master
346+
* wait for new master
347+
* unblock snapshot
348+
* wait for snapshot to complete
349+
*
350+
* verify:
351+
* check SLM policy metadata, last success is not set
352+
*/
353+
final String indexName = "test";
354+
final String policyName = "test-policy";
355+
int clusterSize = masterNodeNames.size() + dataNodeNames.size();
356+
indexRandomDocs(indexName, 20);
357+
createRepository(REPO, "mock");
358+
359+
createSnapshotPolicy(
360+
policyName,
361+
"snap",
362+
NEVER_EXECUTE_CRON_SCHEDULE,
363+
REPO,
364+
indexName,
365+
true,
366+
false,
367+
new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null)
368+
);
369+
370+
blockMasterFromFinalizingSnapshotOnIndexFile(REPO);
371+
372+
final String snapshotName = executePolicy(policyName);
373+
374+
final String initialMaster = internalCluster().getMasterName();
375+
waitForBlock(initialMaster, REPO);
376+
377+
internalCluster().restartNode(initialMaster);
378+
379+
ensureStableCluster(clusterSize);
380+
awaitNoMoreRunningOperations();
381+
382+
assertBusy(() -> {
383+
final SnapshotInfo snapshotInfo;
384+
try {
385+
GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
386+
.setSnapshots(snapshotName)
387+
.get();
388+
snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
389+
} catch (SnapshotMissingException sme) {
390+
throw new AssertionError(sme);
391+
}
392+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
393+
}, 30L, TimeUnit.SECONDS);
394+
395+
assertSnapshotSuccessful(snapshotName);
396+
397+
// test that the SLM policy metadata has not been updated, it should
398+
assertBusy(() -> {
399+
SnapshotLifecyclePolicyItem policy = client().execute(
400+
GetSnapshotLifecycleAction.INSTANCE,
401+
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
402+
).get().getPolicies().getFirst();
403+
assertNull(policy.getLastSuccess());
404+
assertNull(policy.getLastFailure());
405+
assertEquals(0, policy.getPolicyStats().getSnapshotFailedCount());
406+
assertEquals(0, policy.getPolicyStats().getSnapshotTakenCount());
407+
});
408+
409+
// execute the policy again, it should pick up the last missing stats
410+
String snapshotSecond = executePolicy(policyName);
411+
412+
awaitNoMoreRunningOperations();
413+
414+
assertSnapshotSuccessful(snapshotSecond);
415+
416+
// stats should have 2 successful snapshots, instead of 1 successful and 1 failed
417+
assertBusy(() -> {
418+
SnapshotLifecyclePolicyItem policy = client().execute(
419+
GetSnapshotLifecycleAction.INSTANCE,
420+
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
421+
).get().getPolicies().getFirst();
422+
assertNotNull(policy.getLastSuccess());
423+
assertNull(policy.getLastFailure());
424+
assertEquals(0, policy.getPolicyStats().getSnapshotFailedCount());
425+
assertEquals(2, policy.getPolicyStats().getSnapshotTakenCount());
426+
});
427+
//
428+
// final String snapshotName = executePolicy(policyName);
429+
// assertBusy(() -> {
430+
// final SnapshotInfo snapshotInfo;
431+
// try {
432+
// GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
433+
// .setSnapshots(snapshotName)
434+
// .get();
435+
// snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
436+
// } catch (SnapshotMissingException sme) {
437+
// throw new AssertionError(sme);
438+
// }
439+
// assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
440+
// }, 30L, TimeUnit.SECONDS);
441+
//
442+
// assertBusy(() -> {
443+
// SnapshotLifecyclePolicyItem policy = client().execute(
444+
// GetSnapshotLifecycleAction.INSTANCE,
445+
// new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
446+
// ).get().getPolicies().getFirst();
447+
// assertNotNull(policy.getLastSuccess());
448+
// assertNull(policy.getLastFailure());
449+
// }, 30L, TimeUnit.SECONDS);
450+
451+
}
452+
453+
private void assertSnapshotSuccessful(String snapshotName) throws Exception {
454+
assertBusy(() -> {
455+
final SnapshotInfo snapshotInfo;
456+
try {
457+
GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
458+
.setSnapshots(snapshotName)
459+
.get();
460+
snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
461+
} catch (SnapshotMissingException sme) {
462+
throw new AssertionError(sme);
463+
}
464+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
465+
}, 30L, TimeUnit.SECONDS);
466+
}
467+
332468
private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Exception {
333469
final String indexName = "test-idx";
334470
final String policyId = "test-policy";

x-pack/plugin/slm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleService.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,19 @@ public static String getJobId(SnapshotLifecyclePolicyMetadata policyMeta) {
232232
return policyMeta.getPolicy().getId() + "-" + policyMeta.getVersion();
233233
}
234234

235+
/**
236+
* Gets the policy name from a job id, which is expected to be in the format
237+
* {@code <policyid>-<version>}. This method extracts the policy id by
238+
* removing the version part (the last part after the last dash).
239+
*/
240+
public static String getPolicyId(String jobId) {
241+
int lastDashIndex = jobId.lastIndexOf('-');
242+
if (lastDashIndex == -1) {
243+
throw new IllegalArgumentException("Invalid job id format: " + jobId);
244+
}
245+
return jobId.substring(0, lastDashIndex);
246+
}
247+
235248
/**
236249
* Cancel all scheduled snapshot jobs
237250
*/

x-pack/plugin/slm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleTask.java

Lines changed: 135 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,31 @@
1313
import org.elasticsearch.action.ActionListener;
1414
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotRequest;
1515
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse;
16+
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsRequest;
17+
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
18+
import org.elasticsearch.action.admin.cluster.snapshots.get.TransportGetSnapshotsAction;
19+
import org.elasticsearch.action.support.master.AcknowledgedRequest;
20+
import org.elasticsearch.action.support.master.AcknowledgedResponse;
1621
import org.elasticsearch.client.internal.Client;
22+
import org.elasticsearch.cluster.AckedBatchedClusterStateUpdateTask;
23+
import org.elasticsearch.cluster.AckedClusterStateUpdateTask;
1724
import org.elasticsearch.cluster.ClusterState;
25+
import org.elasticsearch.cluster.ClusterStateTaskExecutor;
1826
import org.elasticsearch.cluster.ClusterStateUpdateTask;
27+
import org.elasticsearch.cluster.ProjectState;
28+
import org.elasticsearch.cluster.SimpleBatchedExecutor;
1929
import org.elasticsearch.cluster.SnapshotsInProgress;
2030
import org.elasticsearch.cluster.metadata.ProjectId;
2131
import org.elasticsearch.cluster.metadata.ProjectMetadata;
2232
import org.elasticsearch.cluster.service.ClusterService;
33+
import org.elasticsearch.cluster.service.MasterServiceTaskQueue;
34+
import org.elasticsearch.common.Priority;
2335
import org.elasticsearch.common.Strings;
2436
import org.elasticsearch.common.scheduler.SchedulerEngine;
2537
import org.elasticsearch.core.FixForMultiProject;
2638
import org.elasticsearch.core.SuppressForbidden;
2739
import org.elasticsearch.core.TimeValue;
40+
import org.elasticsearch.core.Tuple;
2841
import org.elasticsearch.snapshots.RegisteredPolicySnapshots;
2942
import org.elasticsearch.snapshots.RegisteredPolicySnapshots.PolicySnapshot;
3043
import org.elasticsearch.snapshots.SnapshotException;
@@ -35,6 +48,7 @@
3548
import org.elasticsearch.xpack.core.ilm.LifecyclePolicySecurityClient;
3649
import org.elasticsearch.xpack.core.slm.SnapshotInvocationRecord;
3750
import org.elasticsearch.xpack.core.slm.SnapshotLifecycleMetadata;
51+
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
3852
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicyMetadata;
3953
import org.elasticsearch.xpack.core.slm.SnapshotLifecycleStats;
4054
import org.elasticsearch.xpack.slm.history.SnapshotHistoryItem;
@@ -53,6 +67,8 @@
5367

5468
import static org.elasticsearch.core.Strings.format;
5569
import static org.elasticsearch.xpack.core.ilm.LifecycleOperationMetadata.currentSLMMode;
70+
import static org.elasticsearch.xpack.slm.SnapshotLifecycleService.getJobId;
71+
import static org.elasticsearch.xpack.slm.SnapshotLifecycleService.getPolicyId;
5672

5773
public class SnapshotLifecycleTask implements SchedulerEngine.Listener {
5874

@@ -62,6 +78,7 @@ public class SnapshotLifecycleTask implements SchedulerEngine.Listener {
6278
private final Client client;
6379
private final ClusterService clusterService;
6480
private final SnapshotHistoryStore historyStore;
81+
private final MasterServiceTaskQueue<UpdatePolicyStatsTask> updatePolicyStatsQueue;
6582

6683
public SnapshotLifecycleTask(
6784
final ProjectId projectId,
@@ -73,13 +90,128 @@ public SnapshotLifecycleTask(
7390
this.client = client;
7491
this.clusterService = clusterService;
7592
this.historyStore = historyStore;
93+
94+
ClusterStateTaskExecutor<UpdatePolicyStatsTask> executor = new SimpleBatchedExecutor<>() {
95+
@Override
96+
public Tuple<ClusterState, Object> executeTask(UpdatePolicyStatsTask updatePolicyStatsTask, ClusterState clusterState) throws Exception {
97+
// TODO
98+
return null;
99+
}
100+
101+
@Override
102+
public void taskSucceeded(UpdatePolicyStatsTask updatePolicyStatsTask, Object o) {
103+
// TODO
104+
}
105+
};
106+
this.updatePolicyStatsQueue = clusterService.createTaskQueue("slm-update-policy-stats", Priority.HIGH, executor);
107+
}
108+
109+
static class UpdatePolicyStatsTask extends ClusterStateUpdateTask {
110+
111+
@Override
112+
public ClusterState execute(ClusterState currentState) throws Exception {
113+
return null;
114+
}
115+
116+
@Override
117+
public void onFailure(Exception e) {
118+
// TODO
119+
}
120+
}
121+
122+
private static List<String> getStaleRegisteredSnapshotIds(ProjectState projectState, String policyId) {
123+
Set<SnapshotId> runningSnapshots = currentlyRunningSnapshots(projectState.cluster());
124+
125+
RegisteredPolicySnapshots registeredSnapshots = projectState.metadata()
126+
.custom(RegisteredPolicySnapshots.TYPE, RegisteredPolicySnapshots.EMPTY);
127+
128+
List<String> staleRegisterSnapshotIds = registeredSnapshots.getSnapshots().stream()
129+
// look for snapshots of this SLM policy, leave the rest to the policy that owns it
130+
.filter(policySnapshot -> policySnapshot.getPolicy().equals(policyId))
131+
// look for snapshots that are no longer running
132+
.filter(policySnapshot -> runningSnapshots.contains(policySnapshot.getSnapshotId()) == false)
133+
.map(policySnapshot -> policySnapshot.getSnapshotId().getName())
134+
.toList();
135+
136+
return staleRegisterSnapshotIds;
76137
}
77138

78139
@Override
79140
public void triggered(SchedulerEngine.Event event) {
80141
logger.debug("snapshot lifecycle policy task triggered from job [{}]", event.jobName());
81-
ProjectMetadata projectMetadata = clusterService.state().getMetadata().getProject(projectId);
82-
final Optional<String> snapshotName = maybeTakeSnapshot(projectMetadata, event.jobName(), client, clusterService, historyStore);
142+
ProjectState projectState = clusterService.state().projectState(projectId);
143+
ProjectMetadata metadata = projectState.metadata();
144+
String policyId = getPolicyId(event.jobName());
145+
146+
List<String> snapshotsToCleanup = getStaleRegisteredSnapshotIds(projectState, policyId);
147+
if (snapshotsToCleanup.isEmpty() == false) {
148+
var policyMetadata = getSnapPolicyMetadata(metadata, event.jobName());
149+
if (policyMetadata.isEmpty()) {
150+
logger.warn("snapshot lifecycle policy for job [{}] no longer exists", event.jobName());
151+
return;
152+
}
153+
SnapshotLifecyclePolicy policy = policyMetadata.get().getPolicy();
154+
155+
GetSnapshotsRequest getSnapshotsRequest = new GetSnapshotsRequest(
156+
TimeValue.MAX_VALUE,
157+
new String[] { policy.getRepository() },
158+
snapshotsToCleanup.toArray(new String[0])
159+
);
160+
161+
GetSnapshotsResponse getSnapshotsResponse = client.admin().cluster()
162+
.execute(TransportGetSnapshotsAction.TYPE, getSnapshotsRequest).actionGet();
163+
164+
165+
// cluster update task
166+
// verify
167+
int countSnapshotFailure = 0;
168+
int countSnapshotSuccess = 0;
169+
SnapshotInfo lastSuccess = null;
170+
SnapshotInfo lastFailure = null;
171+
for (SnapshotInfo snapshotInfo : getSnapshotsResponse.getSnapshots()) {
172+
if (snapshotInfo.state() == null || snapshotInfo.state().completed() == false) {
173+
// skip unknown state and non-completed snapshots
174+
continue;
175+
}
176+
if (snapshotInfo.failedShards() == 0) {
177+
countSnapshotSuccess++;
178+
if (lastSuccess == null || snapshotInfo.startTime() > lastSuccess.startTime()) {
179+
lastSuccess = snapshotInfo;
180+
}
181+
} else {
182+
countSnapshotFailure++;
183+
if (lastFailure == null || snapshotInfo.startTime() > lastFailure.startTime()) {
184+
lastFailure = snapshotInfo;
185+
}
186+
}
187+
}
188+
189+
190+
// client.admin().cluster().getSnapshots(getSnapshotsRequest, new ActionListener<>() {
191+
//
192+
// @Override
193+
// public void onResponse(GetSnapshotsResponse response) {
194+
// int countSnapshotFailed = 0;
195+
// int countSnapshotSuccessful = 0;
196+
// for (SnapshotInfo snapshot : response.getSnapshots()) {
197+
// boolean success = snapshot.failedShards() == 0;
198+
// if (success) {
199+
// countSnapshotSuccessful++;
200+
// } else {
201+
// countSnapshotFailed++;
202+
// }
203+
// }
204+
//
205+
// }
206+
//
207+
// @Override
208+
// public void onFailure(Exception e) {
209+
//
210+
// }
211+
// });
212+
}
213+
214+
final Optional<String> snapshotName = maybeTakeSnapshot(metadata, event.jobName(), client, clusterService, historyStore);
83215

84216
// Would be cleaner if we could use Optional#ifPresentOrElse
85217
snapshotName.ifPresent(
@@ -219,7 +351,7 @@ static Optional<SnapshotLifecyclePolicyMetadata> getSnapPolicyMetadata(final Pro
219351
.flatMap(
220352
configMap -> configMap.values()
221353
.stream()
222-
.filter(policyMeta -> jobId.equals(SnapshotLifecycleService.getJobId(policyMeta)))
354+
.filter(policyMeta -> jobId.equals(getJobId(policyMeta)))
223355
.findFirst()
224356
);
225357
}

0 commit comments

Comments
 (0)