Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
* cluster state as custom metadata. When a snapshot is started by SLM, it is added to this set. Upon completion,
* it is removed. If a snapshot does not record its failure in SnapshotLifecycleStats, likely due to a master shutdown,
* it will not be removed from the registered set. A subsequent snapshot will then find that a registered snapshot
* is no longer running and will infer that it failed, updating SnapshotLifecycleStats accordingly.
* is no longer running and update SnapshotLifecycleStats based on the status of the snapshot.
*/
public class RegisteredPolicySnapshots implements Metadata.ProjectCustom {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import org.elasticsearch.action.admin.cluster.snapshots.restore.TransportRestoreSnapshotAction;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.health.ClusterHealthStatus;
import org.elasticsearch.cluster.routing.UnassignedInfo;
Expand Down Expand Up @@ -72,6 +73,7 @@ public class SLMSnapshotBlockingIntegTests extends AbstractSnapshotIntegTestCase
// Cron expression pinned to February 31st, a date that does not exist, so a policy using it is never fired by its schedule
private static final String NEVER_EXECUTE_CRON_SCHEDULE = "* * * 31 FEB ? *";

// Name of the snapshot repository the tests register and snapshot into
static final String REPO = "my-repo";
// Names of the master-only nodes; assigned in ensureClusterNodes() before each test
List<String> masterNodeNames = null;
// Names of the data-only nodes; assigned in ensureClusterNodes() before each test
List<String> dataNodeNames = null;

@Override
Expand All @@ -85,7 +87,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
/**
 * Starts two master-only and two data-only nodes before each test and records their names in
 * {@code masterNodeNames} and {@code dataNodeNames}, then waits for the cluster to be green.
 * Tests derive the expected cluster size from these two lists, so every node started here
 * must be recorded: master-only nodes are started exactly once and the returned names are kept.
 */
@Before
public void ensureClusterNodes() {
    logger.info("--> starting enough nodes to ensure we have enough to safely stop for tests");
    masterNodeNames = internalCluster().startMasterOnlyNodes(2);
    dataNodeNames = internalCluster().startDataOnlyNodes(2);
    ensureGreen();
}
Expand Down Expand Up @@ -329,6 +331,185 @@ public void testRetentionWithMultipleRepositories() throws Exception {
testUnsuccessfulSnapshotRetention(randomBoolean());
}

// Verifies that SLM stats and lastSuccess/lastFailure end up correct when the master is restarted
// while an SLM-triggered snapshot is still being finalized.
public void testSLMWithMasterShutdown() throws Exception {
    final String policyName = "test-policy";
    final String indexName = "test";
    final int expectedNodeCount = masterNodeNames.size() + dataNodeNames.size();
    indexRandomDocs(indexName, 20);
    createRepository(REPO, "mock");

    final SnapshotRetentionConfiguration retention = new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null);
    createSnapshotPolicy(policyName, "snap", NEVER_EXECUTE_CRON_SCHEDULE, REPO, indexName, true, false, retention);

    // keep the snapshot from finalizing so that it is still in flight when the master goes away
    blockMasterFromFinalizingSnapshotOnIndexFile(REPO);

    // run the policy for the first time and wait until the master is stuck on the block
    final String firstSnapshot = executePolicy(policyName);
    final String originalMaster = internalCluster().getMasterName();
    waitForBlock(originalMaster, REPO);

    // bounce the master and wait for the cluster and all snapshot operations to settle
    internalCluster().restartNode(originalMaster);
    ensureStableCluster(expectedNodeCount);
    awaitNoMoreRunningOperations();

    // the snapshot must have completed successfully despite the master failover
    assertBusy(() -> {
        try {
            final GetSnapshotsResponse response = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
                .setSnapshots(firstSnapshot)
                .get();
            final SnapshotInfo info = response.getSnapshots().get(0);
            assertEquals(SnapshotState.SUCCESS, info.state());
        } catch (SnapshotMissingException missing) {
            throw new AssertionError(missing);
        }
    }, 30L, TimeUnit.SECONDS);
    assertSnapshotSuccessful(firstSnapshot);

    // because of the master shutdown, nothing has been recorded in the policy metadata yet
    assertBusy(() -> {
        final SnapshotLifecyclePolicyItem item = client().execute(
            GetSnapshotLifecycleAction.INSTANCE,
            new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
        ).get().getPolicies().getFirst();
        assertNull(item.getLastSuccess());
        assertNull(item.getLastFailure());
        assertEquals(0, item.getPolicyStats().getSnapshotFailedCount());
        assertEquals(0, item.getPolicyStats().getSnapshotTakenCount());
    });

    // second run of the policy: it is expected to also account for the stats missed by the first run
    final String secondSnapshot = executePolicy(policyName);

    awaitNoMoreRunningOperations();
    assertSnapshotSuccessful(secondSnapshot);

    // two snapshots taken in total: the new one plus the earlier success that had gone unrecorded
    assertBusy(() -> {
        final SnapshotLifecyclePolicyItem item = client().execute(
            GetSnapshotLifecycleAction.INSTANCE,
            new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
        ).get().getPolicies().getFirst();
        assertNull(item.getLastFailure());
        assertNotNull(item.getLastSuccess());
        assertEquals(secondSnapshot, item.getLastSuccess().getSnapshotName());
        assertEquals(0, item.getPolicyStats().getSnapshotFailedCount());
        assertEquals(2, item.getPolicyStats().getSnapshotTakenCount());
    });
}

// Verifies that a snapshot whose stats were never recorded because of a master restart, and which has
// since been deleted from the repository, is counted as a failure on the next SLM execution.
public void testSLMWithMasterShutdownAndDeletedSnapshot() throws Exception {
    final String policyName = "test-policy";
    final String indexName = "test";
    final int expectedNodeCount = masterNodeNames.size() + dataNodeNames.size();
    indexRandomDocs(indexName, 20);
    createRepository(REPO, "mock");

    final SnapshotRetentionConfiguration retention = new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null);
    createSnapshotPolicy(policyName, "snap", NEVER_EXECUTE_CRON_SCHEDULE, REPO, indexName, true, false, retention);

    // keep the snapshot from finalizing so that it is still in flight when the master goes away
    blockMasterFromFinalizingSnapshotOnIndexFile(REPO);

    // run the policy for the first time and wait until the master is stuck on the block
    final String firstSnapshot = executePolicy(policyName);
    final String originalMaster = internalCluster().getMasterName();
    waitForBlock(originalMaster, REPO);

    // bounce the master and wait for the cluster and all snapshot operations to settle
    internalCluster().restartNode(originalMaster);
    ensureStableCluster(expectedNodeCount);
    awaitNoMoreRunningOperations();

    // the snapshot must have completed successfully despite the master failover
    assertBusy(() -> {
        try {
            final GetSnapshotsResponse response = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
                .setSnapshots(firstSnapshot)
                .get();
            final SnapshotInfo info = response.getSnapshots().get(0);
            assertEquals(SnapshotState.SUCCESS, info.state());
        } catch (SnapshotMissingException missing) {
            throw new AssertionError(missing);
        }
    }, 30L, TimeUnit.SECONDS);
    assertSnapshotSuccessful(firstSnapshot);

    // because of the master shutdown, nothing has been recorded in the policy metadata yet
    assertBusy(() -> {
        final SnapshotLifecyclePolicyItem item = client().execute(
            GetSnapshotLifecycleAction.INSTANCE,
            new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
        ).get().getPolicies().getFirst();
        assertNull(item.getLastSuccess());
        assertNull(item.getLastFailure());
        assertEquals(0, item.getPolicyStats().getSnapshotFailedCount());
        assertEquals(0, item.getPolicyStats().getSnapshotTakenCount());
    });

    // remove the first snapshot to simulate a snapshot that has gone missing from the repository
    assertBusy(() -> {
        final AcknowledgedResponse deleteResponse = clusterAdmin().prepareDeleteSnapshot(TEST_REQUEST_TIMEOUT, REPO, firstSnapshot)
            .get();
        assertTrue(deleteResponse.isAcknowledged());
    });

    // second run of the policy: it is expected to also account for the stats missed by the first run
    final String secondSnapshot = executePolicy(policyName);

    awaitNoMoreRunningOperations();
    assertSnapshotSuccessful(secondSnapshot);

    // one success (the new snapshot) and one failure: the deleted snapshot is inferred to have failed
    assertBusy(() -> {
        final SnapshotLifecyclePolicyItem item = client().execute(
            GetSnapshotLifecycleAction.INSTANCE,
            new GetSnapshotLifecycleAction.Request(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, policyName)
        ).get().getPolicies().getFirst();
        assertNotNull(item.getLastSuccess());
        assertEquals(secondSnapshot, item.getLastSuccess().getSnapshotName());
        assertNotNull(item.getLastFailure());
        assertEquals(firstSnapshot, item.getLastFailure().getSnapshotName());
        assertEquals(1, item.getPolicyStats().getSnapshotFailedCount());
        assertEquals(1, item.getPolicyStats().getSnapshotTakenCount());
    });
}

/**
 * Polls the {@link #REPO} repository via {@code assertBusy} until the named snapshot is reported
 * with state {@link SnapshotState#SUCCESS}.
 * <p>
 * A {@link SnapshotMissingException} is rethrown as an {@link AssertionError}, presumably so that
 * {@code assertBusy} treats a not-yet-visible snapshot as a retryable failure.
 *
 * @param snapshotName name of the snapshot expected to have completed successfully
 */
private void assertSnapshotSuccessful(String snapshotName) throws Exception {
    assertBusy(() -> {
        try {
            final GetSnapshotsResponse response = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, REPO)
                .setSnapshots(snapshotName)
                .get();
            final SnapshotInfo info = response.getSnapshots().get(0);
            assertEquals(SnapshotState.SUCCESS, info.state());
        } catch (SnapshotMissingException missing) {
            throw new AssertionError(missing);
        }
    });
}

private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Exception {
final String indexName = "test-idx";
final String policyId = "test-policy";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,17 @@ public static String getJobId(SnapshotLifecyclePolicyMetadata policyMeta) {
return policyMeta.getPolicy().getId() + "-" + policyMeta.getVersion();
}

/**
 * Extracts the policy id from a job id of the form {@code <policyid>-<version>}.
 * Everything before the last dash is treated as the policy id, so policy ids that
 * themselves contain dashes are returned intact.
 *
 * @param jobId the job id to parse; expected to contain at least one dash
 * @return the portion of {@code jobId} preceding its last dash
 */
public static String getPolicyId(String jobId) {
    final int versionSeparator = jobId.lastIndexOf('-');
    // lastIndexOf yields -1 when there is no dash at all, i.e. the id carries no version suffix
    assert versionSeparator >= 0 : "Invalid job id format: " + jobId;
    return jobId.substring(0, versionSeparator);
}

/**
* Cancel all scheduled snapshot jobs
*/
Expand Down
Loading