Skip to content

Commit aea67ba

Browse files
authored
Correctly update SLM stats in case of master shutdown (elastic#134152)
1 parent c66eb2e commit aea67ba

File tree

5 files changed

+564
-61
lines changed

5 files changed

+564
-61
lines changed

docs/changelog/134152.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 134152
2+
summary: Correctly update SLM stats with master shutdown
3+
area: ILM+SLM
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/snapshots/RegisteredPolicySnapshots.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
* cluster state as custom metadata. When a snapshot is started by SLM, it is added to this set. Upon completion,
4545
* it is removed. If a snapshot does not record its failure in SnapshotLifecycleStats, likely due to a master shutdown,
4646
* it will not be removed from the registered set. A subsequent snapshot will then find that a registered snapshot
47-
* is no longer running and will infer that it failed, updating SnapshotLifecycleStats accordingly.
47+
* is no longer running and update SnapshotLifecycleStats based on the status of the snapshot.
4848
*/
4949
public class RegisteredPolicySnapshots implements Metadata.ProjectCustom {
5050

x-pack/plugin/slm/src/internalClusterTest/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

Lines changed: 182 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.elasticsearch.action.admin.cluster.snapshots.restore.TransportRestoreSnapshotAction;
1616
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
1717
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
18+
import org.elasticsearch.action.support.master.AcknowledgedResponse;
1819
import org.elasticsearch.cluster.SnapshotsInProgress;
1920
import org.elasticsearch.cluster.health.ClusterHealthStatus;
2021
import org.elasticsearch.cluster.routing.UnassignedInfo;
@@ -72,6 +73,7 @@ public class SLMSnapshotBlockingIntegTests extends AbstractSnapshotIntegTestCase
7273
private static final String NEVER_EXECUTE_CRON_SCHEDULE = "* * * 31 FEB ? *";
7374

7475
static final String REPO = "my-repo";
76+
List<String> masterNodeNames = null;
7577
List<String> dataNodeNames = null;
7678

7779
@Override
@@ -85,7 +87,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
8587
@Before
8688
public void ensureClusterNodes() {
8789
logger.info("--> starting enough nodes to ensure we have enough to safely stop for tests");
88-
internalCluster().startMasterOnlyNodes(2);
90+
masterNodeNames = internalCluster().startMasterOnlyNodes(2);
8991
dataNodeNames = internalCluster().startDataOnlyNodes(2);
9092
ensureGreen();
9193
}
@@ -329,6 +331,185 @@ public void testRetentionWithMultipleRepositories() throws Exception {
329331
testUnsuccessfulSnapshotRetention(randomBoolean());
330332
}
331333

334+
// Test that SLM stats and lastSuccess/lastFailure are correctly updated with master shutdown
335+
public void testSLMWithMasterShutdown() throws Exception {
336+
final String indexName = "test";
337+
final String policyName = "test-policy";
338+
int clusterSize = masterNodeNames.size() + dataNodeNames.size();
339+
indexRandomDocs(indexName, 20);
340+
createRepository(REPO, "mock");
341+
342+
createSnapshotPolicy(
343+
policyName,
344+
"snap",
345+
NEVER_EXECUTE_CRON_SCHEDULE,
346+
REPO,
347+
indexName,
348+
true,
349+
false,
350+
new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null)
351+
);
352+
353+
// block snapshot from completing
354+
blockMasterFromFinalizingSnapshotOnIndexFile(REPO);
355+
356+
// first SLM execution
357+
final String snapshotName = executePolicy(policyName);
358+
final String initialMaster = internalCluster().getMasterName();
359+
waitForBlock(initialMaster, REPO);
360+
361+
// restart master
362+
internalCluster().restartNode(initialMaster);
363+
ensureStableCluster(clusterSize);
364+
awaitNoMoreRunningOperations();
365+
366+
// ensure snapshot is completed successfully after master failover
367+
assertBusy(() -> {
368+
final SnapshotInfo snapshotInfo;
369+
try {
370+
GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
371+
.setSnapshots(snapshotName)
372+
.get();
373+
snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
374+
} catch (SnapshotMissingException sme) {
375+
throw new AssertionError(sme);
376+
}
377+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
378+
}, 30L, TimeUnit.SECONDS);
379+
assertSnapshotSuccessful(snapshotName);
380+
381+
// the SLM policy metadata has not been updated due to master shutdown
382+
assertBusy(() -> {
383+
SnapshotLifecyclePolicyItem policy = client().execute(
384+
GetSnapshotLifecycleAction.INSTANCE,
385+
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
386+
).get().getPolicies().getFirst();
387+
assertNull(policy.getLastSuccess());
388+
assertNull(policy.getLastFailure());
389+
assertEquals(0, policy.getPolicyStats().getSnapshotFailedCount());
390+
assertEquals(0, policy.getPolicyStats().getSnapshotTakenCount());
391+
});
392+
393+
// 2nd SLM execution, it should pick up the last missing stats
394+
String snapshotSecond = executePolicy(policyName);
395+
396+
awaitNoMoreRunningOperations();
397+
assertSnapshotSuccessful(snapshotSecond);
398+
399+
// stats should have 2 successful snapshots, 1 from the new snapshot and 1 from previous success
400+
assertBusy(() -> {
401+
SnapshotLifecyclePolicyItem policy = client().execute(
402+
GetSnapshotLifecycleAction.INSTANCE,
403+
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
404+
).get().getPolicies().getFirst();
405+
assertNull(policy.getLastFailure());
406+
assertNotNull(policy.getLastSuccess());
407+
assertEquals(snapshotSecond, policy.getLastSuccess().getSnapshotName());
408+
assertEquals(0, policy.getPolicyStats().getSnapshotFailedCount());
409+
assertEquals(2, policy.getPolicyStats().getSnapshotTakenCount());
410+
});
411+
}
412+
413+
public void testSLMWithMasterShutdownAndDeletedSnapshot() throws Exception {
414+
final String indexName = "test";
415+
final String policyName = "test-policy";
416+
int clusterSize = masterNodeNames.size() + dataNodeNames.size();
417+
indexRandomDocs(indexName, 20);
418+
createRepository(REPO, "mock");
419+
420+
createSnapshotPolicy(
421+
policyName,
422+
"snap",
423+
NEVER_EXECUTE_CRON_SCHEDULE,
424+
REPO,
425+
indexName,
426+
true,
427+
false,
428+
new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null)
429+
);
430+
431+
// block snapshot from completing
432+
blockMasterFromFinalizingSnapshotOnIndexFile(REPO);
433+
434+
// first SLM execution
435+
final String snapshotName = executePolicy(policyName);
436+
final String initialMaster = internalCluster().getMasterName();
437+
waitForBlock(initialMaster, REPO);
438+
439+
// restart master
440+
internalCluster().restartNode(initialMaster);
441+
ensureStableCluster(clusterSize);
442+
awaitNoMoreRunningOperations();
443+
444+
// ensure snapshot is completed successfully after master failover
445+
assertBusy(() -> {
446+
final SnapshotInfo snapshotInfo;
447+
try {
448+
GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
449+
.setSnapshots(snapshotName)
450+
.get();
451+
snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
452+
} catch (SnapshotMissingException sme) {
453+
throw new AssertionError(sme);
454+
}
455+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
456+
}, 30L, TimeUnit.SECONDS);
457+
assertSnapshotSuccessful(snapshotName);
458+
459+
// the SLM policy metadata has not been updated due to master shutdown
460+
assertBusy(() -> {
461+
SnapshotLifecyclePolicyItem policy = client().execute(
462+
GetSnapshotLifecycleAction.INSTANCE,
463+
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
464+
).get().getPolicies().getFirst();
465+
assertNull(policy.getLastSuccess());
466+
assertNull(policy.getLastFailure());
467+
assertEquals(0, policy.getPolicyStats().getSnapshotFailedCount());
468+
assertEquals(0, policy.getPolicyStats().getSnapshotTakenCount());
469+
});
470+
471+
// delete the snapshot, simulate missing snapshot from repo
472+
assertBusy(() -> {
473+
AcknowledgedResponse response = clusterAdmin().prepareDeleteSnapshot(TEST_REQUEST_TIMEOUT, REPO, snapshotName).get();
474+
assertTrue(response.isAcknowledged());
475+
});
476+
477+
// 2nd SLM execution, it should pick up the last missing stats
478+
String snapshotSecond = executePolicy(policyName);
479+
480+
awaitNoMoreRunningOperations();
481+
assertSnapshotSuccessful(snapshotSecond);
482+
483+
// stats should have 1 successful and 1 failed snapshot, the deleted snapshot is inferred failure
484+
assertBusy(() -> {
485+
SnapshotLifecyclePolicyItem policy = client().execute(
486+
GetSnapshotLifecycleAction.INSTANCE,
487+
new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
488+
).get().getPolicies().getFirst();
489+
assertNotNull(policy.getLastSuccess());
490+
assertEquals(snapshotSecond, policy.getLastSuccess().getSnapshotName());
491+
assertNotNull(policy.getLastFailure());
492+
assertEquals(snapshotName, policy.getLastFailure().getSnapshotName());
493+
assertEquals(1, policy.getPolicyStats().getSnapshotFailedCount());
494+
assertEquals(1, policy.getPolicyStats().getSnapshotTakenCount());
495+
});
496+
}
497+
498+
private void assertSnapshotSuccessful(String snapshotName) throws Exception {
499+
assertBusy(() -> {
500+
final SnapshotInfo snapshotInfo;
501+
try {
502+
GetSnapshotsResponse snapshotsStatusResponse = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
503+
.setSnapshots(snapshotName)
504+
.get();
505+
snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
506+
} catch (SnapshotMissingException sme) {
507+
throw new AssertionError(sme);
508+
}
509+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
510+
});
511+
}
512+
332513
private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Exception {
333514
final String indexName = "test-idx";
334515
final String policyId = "test-policy";

0 commit comments

Comments
 (0)