diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RestoreSnapshotIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RestoreSnapshotIT.java index 953cddba0ab7a..1dabfb5a0edd5 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RestoreSnapshotIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RestoreSnapshotIT.java @@ -12,6 +12,8 @@ import org.apache.logging.log4j.Level; import org.elasticsearch.action.ActionFuture; import org.elasticsearch.action.ActionRequestBuilder; +import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest; +import org.elasticsearch.action.admin.cluster.allocation.TransportClusterAllocationExplainAction; import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse; import org.elasticsearch.action.admin.indices.settings.get.GetSettingsResponse; import org.elasticsearch.action.admin.indices.template.get.GetIndexTemplatesResponse; @@ -20,6 +22,8 @@ import org.elasticsearch.cluster.block.ClusterBlocks; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.metadata.MappingMetadata; +import org.elasticsearch.cluster.routing.allocation.decider.Decision; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeUnit; import org.elasticsearch.core.TimeValue; @@ -41,6 +45,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -63,6 +68,7 @@ import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.notNullValue; import static org.hamcrest.Matchers.nullValue; +import static org.hamcrest.Matchers.startsWith; public class RestoreSnapshotIT extends AbstractSnapshotIntegTestCase { @@ -1025,4 +1031,75 @@ public void 
testNoWarningsOnRestoreOverClosedIndex() throws IllegalAccessExcepti mockLog.assertAllExpectationsMatched(); } } + + public void testExplainUnassignableDuringRestore() { + final String repoName = "repo-" + randomIdentifier(); + createRepository(repoName, FsRepository.TYPE); + final String indexName = "index-" + randomIdentifier(); + createIndexWithContent(indexName); + final String snapshotName = "snapshot-" + randomIdentifier(); + createSnapshot(repoName, snapshotName, List.of(indexName)); + assertAcked(indicesAdmin().prepareDelete(indexName)); + + final RestoreSnapshotResponse restoreSnapshotResponse = clusterAdmin().prepareRestoreSnapshot( + TEST_REQUEST_TIMEOUT, + repoName, + snapshotName + ) + .setIndices(indexName) + .setRestoreGlobalState(false) + .setWaitForCompletion(true) + .setIndexSettings( + Settings.builder().put(IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_PREFIX + "._name", "not-a-node-" + randomIdentifier()) + ) + .get(); + + logger.info("--> restoreSnapshotResponse: {}", Strings.toString(restoreSnapshotResponse, true, true)); + assertThat(restoreSnapshotResponse.getRestoreInfo().failedShards(), greaterThan(0)); + + final var clusterExplainResponse1 = client().execute( + TransportClusterAllocationExplainAction.TYPE, + new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex(indexName).setShard(0).setPrimary(true) + ).actionGet(); + + logger.info("--> clusterExplainResponse1: {}", Strings.toString(clusterExplainResponse1, true, true)); + for (var nodeDecision : clusterExplainResponse1.getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getNodeDecisions()) { + assertEquals( + Set.of("restore_in_progress", "filter"), + nodeDecision.getCanAllocateDecision().getDecisions().stream().map(Decision::label).collect(Collectors.toSet()) + ); + } + + updateIndexSettings(Settings.builder().putNull(IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_PREFIX + "._name"), indexName); + + final var clusterExplainResponse2 = client().execute( + 
TransportClusterAllocationExplainAction.TYPE, + new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex(indexName).setShard(0).setPrimary(true) + ).actionGet(); + + logger.info("--> clusterExplainResponse2: {}", Strings.toString(clusterExplainResponse2, true, true)); + for (var nodeDecision : clusterExplainResponse2.getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getNodeDecisions()) { + assertEquals( + Set.of("restore_in_progress"), + nodeDecision.getCanAllocateDecision().getDecisions().stream().map(Decision::label).collect(Collectors.toSet()) + ); + assertEquals( + Decision.Type.NO, + nodeDecision.getCanAllocateDecision().type() + ); + assertThat( + nodeDecision.getCanAllocateDecision().getDecisions().get(0).getExplanation(), + startsWith( + "Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. " + + "Please check constraints applied in index and cluster settings, then retry the restore." 
+ ) + ); + } + } } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDecider.java index e945795bdb083..2d8170dbe4af1 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDecider.java @@ -13,7 +13,9 @@ import org.elasticsearch.cluster.routing.RecoverySource; import org.elasticsearch.cluster.routing.RoutingNode; import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.common.ReferenceDocs; /** * This {@link AllocationDecider} prevents shards that have failed to be @@ -49,15 +51,34 @@ public Decision canAllocate(final ShardRouting shardRouting, final RoutingAlloca return allocation.decision(Decision.YES, NAME, "shard is currently being restored"); } } - return allocation.decision( - Decision.NO, - NAME, - "shard has failed to be restored from the snapshot [%s] - manually close or delete the index [%s] in order to retry " - + "to restore the snapshot again or use the reroute API to force the allocation of an empty primary shard. Details: [%s]", - source.snapshot(), - shardRouting.getIndexName(), - shardRouting.unassignedInfo().details() - ); + + /* + * Postcondition: the RestoreInProgress.Entry is non-existent. This section differentiates between a restore that failed + * because of an indexing fault (see AllocationService#applyFailedShards) and one that failed because of an allocation + * failure. 
+ */ + UnassignedInfo unassignedInfo = shardRouting.unassignedInfo(); + if (unassignedInfo.failedAllocations() > 0) { + return allocation.decision( + Decision.NO, + NAME, + "shard has failed to be restored from the snapshot [%s] - manually close or delete the index [%s] in order to retry " + + "to restore the snapshot again or use the reroute API to force the allocation of an empty primary shard. Check the " + + "logs for more information about the failure. Details: [%s]", + source.snapshot(), + shardRouting.getIndexName(), + unassignedInfo.details() + ); + } else { + return allocation.decision( + Decision.NO, + NAME, + "Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. " + + "Please check constraints applied in index and cluster settings, then retry the restore. " + + "See [%s] for more details on using the allocation explain API.", + ReferenceDocs.ALLOCATION_EXPLAIN_API + ); + } } @Override diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDeciderTests.java index bc684fd0ea01c..22f78a1006274 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDeciderTests.java @@ -40,7 +40,6 @@ import static java.util.Collections.singletonList; import static org.elasticsearch.cluster.routing.RoutingNodesHelper.shardsWithState; -import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.startsWith; /** @@ -80,10 +79,9 @@ public void testCannotAllocatePrimaryMissingInRestoreInProgress() { assertEquals(Decision.Type.NO, decision.type()); assertThat( decision.getExplanation(), - equalTo( - "shard has failed to be restored from the 
snapshot [default:_repository:_missing/_uuid] - manually close or " - + "delete the index [test] in order to retry to restore the snapshot again or use the reroute API " - + "to force the allocation of an empty primary shard. Details: [restore_source[_repository/_missing]]" + startsWith( + "Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. " + + "Please check constraints applied in index and cluster settings, then retry the restore." ) ); } @@ -105,17 +103,28 @@ public void testCanAllocatePrimaryExistingInRestoreInProgress() { routingTable = clusterState.routingTable(); final RestoreInProgress.State shardState; + final int failureCount; if (randomBoolean()) { shardState = randomFrom(RestoreInProgress.State.STARTED, RestoreInProgress.State.INIT); + failureCount = 0; } else { shardState = RestoreInProgress.State.FAILURE; UnassignedInfo currentInfo = primary.unassignedInfo(); + UnassignedInfo.Reason reason; + if (randomBoolean()) { + failureCount = randomBoolean() ? 
1 : randomIntBetween(2, 10); + reason = UnassignedInfo.Reason.ALLOCATION_FAILED; + } else { + failureCount = 0; + reason = currentInfo.reason(); + } + + UnassignedInfo newInfo = new UnassignedInfo( - currentInfo.reason(), + reason, currentInfo.message(), new IOException("i/o failure"), - currentInfo.failedAllocations(), + failureCount, currentInfo.unassignedTimeNanos(), currentInfo.unassignedTimeMillis(), currentInfo.delayed(), @@ -165,16 +174,25 @@ public void testCanAllocatePrimaryExistingInRestoreInProgress() { Decision decision = executeAllocation(clusterState, primary); if (shardState == RestoreInProgress.State.FAILURE) { assertEquals(Decision.Type.NO, decision.type()); - assertThat( - decision.getExplanation(), - startsWith( - "shard has failed to be restored from the snapshot [default:_repository:_existing/_uuid]" - + " - manually close or delete the index " - + "[test] in order to retry to restore the snapshot again or use the reroute API to force the allocation of " - + "an empty primary shard. Details: [restore_source[_repository/_existing], failure " - + "java.io.IOException: i/o failure" - ) - ); + if (failureCount > 0) { + assertThat( + decision.getExplanation(), + startsWith( + "shard has failed to be restored from the snapshot [default:_repository:_existing/_uuid]" + + " - manually close or delete the index " + + "[test] in order to retry to restore the snapshot again or use the reroute API to force the allocation of " + + "an empty primary shard. Check the logs for more information about the failure. Details:" + ) + ); + } else { + assertThat( + decision.getExplanation(), + startsWith( + "Restore from snapshot failed because the configured constraints prevented allocation on any of the available " + + "nodes. Please check constraints applied in index and cluster settings, then retry the restore." + ) + ); + } } else { assertEquals(Decision.Type.YES, decision.type()); assertEquals("shard is currently being restored", decision.getExplanation());