Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import org.apache.logging.log4j.Level;
import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.ActionRequestBuilder;
import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest;
import org.elasticsearch.action.admin.cluster.allocation.TransportClusterAllocationExplainAction;
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
import org.elasticsearch.action.admin.indices.settings.get.GetSettingsResponse;
import org.elasticsearch.action.admin.indices.template.get.GetIndexTemplatesResponse;
Expand All @@ -20,6 +22,8 @@
import org.elasticsearch.cluster.block.ClusterBlocks;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.MappingMetadata;
import org.elasticsearch.cluster.routing.allocation.decider.Decision;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.core.TimeValue;
Expand All @@ -41,6 +45,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
Expand All @@ -63,6 +68,7 @@
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.hamcrest.Matchers.startsWith;

public class RestoreSnapshotIT extends AbstractSnapshotIntegTestCase {

Expand Down Expand Up @@ -1025,4 +1031,75 @@ public void testNoWarningsOnRestoreOverClosedIndex() throws IllegalAccessExcepti
mockLog.assertAllExpectationsMatched();
}
}

/**
 * Checks what the cluster allocation explain API reports for a primary shard that could not be assigned
 * while restoring a snapshot.
 * <p>
 * The index is restored with an {@code index.routing.allocation.require._name} style filter that names a
 * non-existent node, and the test expects the restore to report failed shards. It then asserts that:
 * <ol>
 *   <li>while the filter is in place, every node decision carries exactly the {@code restore_in_progress}
 *       and {@code filter} decider labels;</li>
 *   <li>once the filter is removed, every node decision carries only {@code restore_in_progress}, and its
 *       explanation starts with the "configured constraints prevented allocation" message.</li>
 * </ol>
 */
public void testExplainUnassigableDuringRestore() {
    final String repoName = "repo-" + randomIdentifier();
    createRepository(repoName, FsRepository.TYPE);
    final String indexName = "index-" + randomIdentifier();
    createIndexWithContent(indexName);
    final String snapshotName = "snapshot-" + randomIdentifier();
    createSnapshot(repoName, snapshotName, List.of(indexName));
    assertAcked(indicesAdmin().prepareDelete(indexName));

    // Setting key used both to install the filter during the restore and to remove it afterwards.
    final String requireNameSetting = IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_PREFIX + "._name";

    final RestoreSnapshotResponse restoreSnapshotResponse = clusterAdmin().prepareRestoreSnapshot(
        TEST_REQUEST_TIMEOUT,
        repoName,
        snapshotName
    )
        .setIndices(indexName)
        .setRestoreGlobalState(false)
        .setWaitForCompletion(true)
        .setIndexSettings(Settings.builder().put(requireNameSetting, "not-a-node-" + randomIdentifier()))
        .get();

    logger.info("--> restoreSnapshotResponse: {}", Strings.toString(restoreSnapshotResponse, true, true));
    assertThat(restoreSnapshotResponse.getRestoreInfo().failedShards(), greaterThan(0));

    final var clusterExplainResponse1 = client().execute(
        TransportClusterAllocationExplainAction.TYPE,
        new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex(indexName).setShard(0).setPrimary(true)
    ).actionGet();

    logger.info("--> clusterExplainResponse1: {}", Strings.toString(clusterExplainResponse1, true, true));
    final var nodeDecisionsWithFilter = clusterExplainResponse1.getExplanation()
        .getShardAllocationDecision()
        .getAllocateDecision()
        .getNodeDecisions();
    // Guard against the loop below passing vacuously when no node decisions are reported.
    assertFalse("expected at least one node decision", nodeDecisionsWithFilter.isEmpty());
    for (var nodeDecision : nodeDecisionsWithFilter) {
        assertEquals(
            Set.of("restore_in_progress", "filter"),
            nodeDecision.getCanAllocateDecision().getDecisions().stream().map(Decision::label).collect(Collectors.toSet())
        );
    }

    // Remove the filter so that only the restore-in-progress decider keeps rejecting the shard.
    updateIndexSettings(Settings.builder().putNull(requireNameSetting), indexName);

    final var clusterExplainResponse2 = client().execute(
        TransportClusterAllocationExplainAction.TYPE,
        new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex(indexName).setShard(0).setPrimary(true)
    ).actionGet();

    logger.info("--> clusterExplainResponse2: {}", Strings.toString(clusterExplainResponse2, true, true));
    final var nodeDecisionsWithoutFilter = clusterExplainResponse2.getExplanation()
        .getShardAllocationDecision()
        .getAllocateDecision()
        .getNodeDecisions();
    assertFalse("expected at least one node decision", nodeDecisionsWithoutFilter.isEmpty());
    for (var nodeDecision : nodeDecisionsWithoutFilter) {
        final var deciderDecisions = nodeDecision.getCanAllocateDecision().getDecisions();
        assertEquals(Set.of("restore_in_progress"), deciderDecisions.stream().map(Decision::label).collect(Collectors.toSet()));
        assertThat(
            deciderDecisions.get(0).getExplanation(),
            startsWith(
                "Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. "
                    + "Please check constraints applied in index and cluster settings, then retry the restore."
            )
        );
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
import org.elasticsearch.cluster.routing.RecoverySource;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.common.ReferenceDocs;

/**
* This {@link AllocationDecider} prevents shards that have failed to be
Expand Down Expand Up @@ -49,15 +51,34 @@ public Decision canAllocate(final ShardRouting shardRouting, final RoutingAlloca
return allocation.decision(Decision.YES, NAME, "shard is currently being restored");
}
}
return allocation.decision(
Decision.NO,
NAME,
"shard has failed to be restored from the snapshot [%s] - manually close or delete the index [%s] in order to retry "
+ "to restore the snapshot again or use the reroute API to force the allocation of an empty primary shard. Details: [%s]",
source.snapshot(),
shardRouting.getIndexName(),
shardRouting.unassignedInfo().details()
);

/**
* POST: the RestoreInProgress.Entry is non-existent. This section differentiates between a restore that failed
* because of an indexing fault (see {@link AllocationService#applyFailedShards}) or because of an allocation
* failure.
*/
UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
if (unassignedInfo.failedAllocations() > 0) {
Comment on lines +60 to +61
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looking good, matches with what David said in the #100233

return allocation.decision(
Decision.NO,
NAME,
"shard has failed to be restored from the snapshot [%s] - manually close or delete the index [%s] in order to retry "
+ "to restore the snapshot again or use the reroute API to force the allocation of an empty primary shard. Check the "
+ "logs for more information about the failure. Details: [%s]",
source.snapshot(),
shardRouting.getIndexName(),
unassignedInfo.details()
);
} else {
return allocation.decision(
Decision.NO,
NAME,
"Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. "
+ "Please check constraints applied in index and cluster settings, then retry the restore. "
+ "See [%s] for more details on using the allocation explain API.",
ReferenceDocs.ALLOCATION_EXPLAIN_API
);
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@

import static java.util.Collections.singletonList;
import static org.elasticsearch.cluster.routing.RoutingNodesHelper.shardsWithState;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.startsWith;

/**
Expand Down Expand Up @@ -80,10 +79,9 @@ public void testCannotAllocatePrimaryMissingInRestoreInProgress() {
assertEquals(Decision.Type.NO, decision.type());
assertThat(
decision.getExplanation(),
equalTo(
"shard has failed to be restored from the snapshot [default:_repository:_missing/_uuid] - manually close or "
+ "delete the index [test] in order to retry to restore the snapshot again or use the reroute API "
+ "to force the allocation of an empty primary shard. Details: [restore_source[_repository/_missing]]"
startsWith(
"Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. "
+ "Please check constraints applied in index and cluster settings, then retry the restore."
)
);
}
Expand All @@ -105,17 +103,28 @@ public void testCanAllocatePrimaryExistingInRestoreInProgress() {
routingTable = clusterState.routingTable();

final RestoreInProgress.State shardState;
final int failureCount;
if (randomBoolean()) {
shardState = randomFrom(RestoreInProgress.State.STARTED, RestoreInProgress.State.INIT);
failureCount = 0;
} else {
shardState = RestoreInProgress.State.FAILURE;

UnassignedInfo currentInfo = primary.unassignedInfo();
UnassignedInfo.Reason reason;
if (randomBoolean()) {
failureCount = randomBoolean() ? 0 : 1;
reason = UnassignedInfo.Reason.ALLOCATION_FAILED;
} else {
failureCount = 0;
reason = currentInfo.reason();
}

UnassignedInfo newInfo = new UnassignedInfo(
currentInfo.reason(),
reason,
currentInfo.message(),
new IOException("i/o failure"),
currentInfo.failedAllocations(),
failureCount,
currentInfo.unassignedTimeNanos(),
currentInfo.unassignedTimeMillis(),
currentInfo.delayed(),
Expand Down Expand Up @@ -165,16 +174,25 @@ public void testCanAllocatePrimaryExistingInRestoreInProgress() {
Decision decision = executeAllocation(clusterState, primary);
if (shardState == RestoreInProgress.State.FAILURE) {
assertEquals(Decision.Type.NO, decision.type());
assertThat(
decision.getExplanation(),
startsWith(
"shard has failed to be restored from the snapshot [default:_repository:_existing/_uuid]"
+ " - manually close or delete the index "
+ "[test] in order to retry to restore the snapshot again or use the reroute API to force the allocation of "
+ "an empty primary shard. Details: [restore_source[_repository/_existing], failure "
+ "java.io.IOException: i/o failure"
)
);
if (failureCount > 0) {
assertThat(
decision.getExplanation(),
startsWith(
"shard has failed to be restored from the snapshot [default:_repository:_existing/_uuid]"
+ " - manually close or delete the index "
+ "[test] in order to retry to restore the snapshot again or use the reroute API to force the allocation of "
+ "an empty primary shard. Check the logs for more information about the failure. Details:"
)
);
} else {
assertThat(
decision.getExplanation(),
startsWith(
"Restore from snapshot failed because the configured constraints prevented allocation on any of the available "
+ "nodes. Please check constraints applied in index and cluster settings, then retry the restore."
)
);
}
} else {
assertEquals(Decision.Type.YES, decision.type());
assertEquals("shard is currently being restored", decision.getExplanation());
Expand Down