Commit 2787546
allocation: clarify RestoreInProgressAllocationDecider failure message (#132307)
The RestoreInProgressAllocationDecider can emit an alarming message about shard restoration failure when declining a shard allocation. Sometimes the restore really has failed, but sometimes the shard is unassigned only because another decider declined the allocation in a previous round. This change checks the allocation failure count in UnassignedInfo so that the decider reports a message appropriate to each case. Fixes ES-11809
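
At its core, the change makes the decider branch on the shard's allocation-failure count once no in-progress restore entry remains. A minimal sketch of that branching, paraphrased from the decider diff below (a method-body fragment assuming the surrounding canAllocate context for shardRouting, allocation, source and NAME; message text is abbreviated here, the exact wording is in the diff):

// Paraphrased branch from RestoreInProgressAllocationDecider.canAllocate (not the verbatim patch).
UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
if (unassignedInfo.failedAllocations() > 0) {
    // At least one allocation attempt genuinely failed while restoring, so keep the
    // original guidance: close/delete the index, or force-allocate an empty primary.
    return allocation.decision(
        Decision.NO,
        NAME,
        "shard has failed to be restored from the snapshot [%s] ... Details: [%s]",
        source.snapshot(),
        unassignedInfo.details()
    );
} else {
    // No allocation attempt ever failed: another decider (e.g. a routing filter)
    // declined the shard, so blame the configured constraints rather than the restore.
    return allocation.decision(
        Decision.NO,
        NAME,
        "Restore from snapshot failed because the configured constraints prevented allocation ... (abbreviated)"
    );
}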
1 parent a229fd2 commit 2787546

3 files changed: +142 additions, -26 deletions
server/src/internalClusterTest/java/org/elasticsearch/snapshots/RestoreSnapshotIT.java

Lines changed: 77 additions & 0 deletions

@@ -12,6 +12,8 @@
 import org.apache.logging.log4j.Level;
 import org.elasticsearch.action.ActionFuture;
 import org.elasticsearch.action.ActionRequestBuilder;
+import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest;
+import org.elasticsearch.action.admin.cluster.allocation.TransportClusterAllocationExplainAction;
 import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
 import org.elasticsearch.action.admin.indices.settings.get.GetSettingsResponse;
 import org.elasticsearch.action.admin.indices.template.get.GetIndexTemplatesResponse;
@@ -20,6 +22,8 @@
 import org.elasticsearch.cluster.block.ClusterBlocks;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.metadata.MappingMetadata;
+import org.elasticsearch.cluster.routing.allocation.decider.Decision;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.core.TimeValue;
@@ -41,6 +45,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -63,6 +68,7 @@
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.nullValue;
+import static org.hamcrest.Matchers.startsWith;
 
 public class RestoreSnapshotIT extends AbstractSnapshotIntegTestCase {
 
@@ -1025,4 +1031,75 @@ public void testNoWarningsOnRestoreOverClosedIndex() throws IllegalAccessExcepti
             mockLog.assertAllExpectationsMatched();
         }
     }
+
+    public void testExplainUnassigableDuringRestore() {
+        final String repoName = "repo-" + randomIdentifier();
+        createRepository(repoName, FsRepository.TYPE);
+        final String indexName = "index-" + randomIdentifier();
+        createIndexWithContent(indexName);
+        final String snapshotName = "snapshot-" + randomIdentifier();
+        createSnapshot(repoName, snapshotName, List.of(indexName));
+        assertAcked(indicesAdmin().prepareDelete(indexName));
+
+        final RestoreSnapshotResponse restoreSnapshotResponse = clusterAdmin().prepareRestoreSnapshot(
+            TEST_REQUEST_TIMEOUT,
+            repoName,
+            snapshotName
+        )
+            .setIndices(indexName)
+            .setRestoreGlobalState(false)
+            .setWaitForCompletion(true)
+            .setIndexSettings(
+                Settings.builder().put(IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_PREFIX + "._name", "not-a-node-" + randomIdentifier())
+            )
+            .get();
+
+        logger.info("--> restoreSnapshotResponse: {}", Strings.toString(restoreSnapshotResponse, true, true));
+        assertThat(restoreSnapshotResponse.getRestoreInfo().failedShards(), greaterThan(0));
+
+        final var clusterExplainResponse1 = client().execute(
+            TransportClusterAllocationExplainAction.TYPE,
+            new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex(indexName).setShard(0).setPrimary(true)
+        ).actionGet();
+
+        logger.info("--> clusterExplainResponse1: {}", Strings.toString(clusterExplainResponse1, true, true));
+        for (var nodeDecision : clusterExplainResponse1.getExplanation()
+            .getShardAllocationDecision()
+            .getAllocateDecision()
+            .getNodeDecisions()) {
+            assertEquals(
+                Set.of("restore_in_progress", "filter"),
+                nodeDecision.getCanAllocateDecision().getDecisions().stream().map(Decision::label).collect(Collectors.toSet())
+            );
+        }
+
+        updateIndexSettings(Settings.builder().putNull(IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_PREFIX + "._name"), indexName);
+
+        final var clusterExplainResponse2 = client().execute(
+            TransportClusterAllocationExplainAction.TYPE,
+            new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex(indexName).setShard(0).setPrimary(true)
+        ).actionGet();
+
+        logger.info("--> clusterExplainResponse2: {}", Strings.toString(clusterExplainResponse2, true, true));
+        for (var nodeDecision : clusterExplainResponse2.getExplanation()
+            .getShardAllocationDecision()
+            .getAllocateDecision()
+            .getNodeDecisions()) {
+            assertEquals(
+                Set.of("restore_in_progress"),
+                nodeDecision.getCanAllocateDecision().getDecisions().stream().map(Decision::label).collect(Collectors.toSet())
+            );
+            assertEquals(
+                Set.of("restore_in_progress"),
+                nodeDecision.getCanAllocateDecision().getDecisions().stream().map(Decision::label).collect(Collectors.toSet())
+            );
+            assertThat(
+                nodeDecision.getCanAllocateDecision().getDecisions().get(0).getExplanation(),
+                startsWith(
+                    "Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. "
+                        + "Please check constraints applied in index and cluster settings, then retry the restore."
+                )
+            );
+        }
+    }
 }

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDecider.java

Lines changed: 30 additions & 9 deletions

@@ -13,7 +13,9 @@
 import org.elasticsearch.cluster.routing.RecoverySource;
 import org.elasticsearch.cluster.routing.RoutingNode;
 import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.UnassignedInfo;
 import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
+import org.elasticsearch.common.ReferenceDocs;
 
 /**
  * This {@link AllocationDecider} prevents shards that have failed to be
@@ -49,15 +51,34 @@ public Decision canAllocate(final ShardRouting shardRouting, final RoutingAlloca
                 return allocation.decision(Decision.YES, NAME, "shard is currently being restored");
             }
         }
-        return allocation.decision(
-            Decision.NO,
-            NAME,
-            "shard has failed to be restored from the snapshot [%s] - manually close or delete the index [%s] in order to retry "
-                + "to restore the snapshot again or use the reroute API to force the allocation of an empty primary shard. Details: [%s]",
-            source.snapshot(),
-            shardRouting.getIndexName(),
-            shardRouting.unassignedInfo().details()
-        );
+
+        /*
+         * POST: the RestoreInProgress.Entry is non-existent. This section differentiates between a restore that failed
+         * because of an indexing fault (see {@link AllocationService#applyFailedShards}) and one that failed because of
+         * an allocation failure.
+         */
+        UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
+        if (unassignedInfo.failedAllocations() > 0) {
+            return allocation.decision(
+                Decision.NO,
+                NAME,
+                "shard has failed to be restored from the snapshot [%s] - manually close or delete the index [%s] in order to retry "
+                    + "to restore the snapshot again or use the reroute API to force the allocation of an empty primary shard. Check the "
+                    + "logs for more information about the failure. Details: [%s]",
+                source.snapshot(),
+                shardRouting.getIndexName(),
+                unassignedInfo.details()
+            );
+        } else {
+            return allocation.decision(
+                Decision.NO,
+                NAME,
+                "Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. "
+                    + "Please check constraints applied in index and cluster settings, then retry the restore. "
+                    + "See [%s] for more details on using the allocation explain API.",
+                ReferenceDocs.ALLOCATION_EXPLAIN_API
+            );
+        }
     }
 
     @Override
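
The new wording surfaces through the cluster allocation explain API, as the integration test above demonstrates. A minimal sketch of reading it from test code, under the same assumptions as that test (an internal-cluster test context where client(), logger and TEST_REQUEST_TIMEOUT are available; "my-index" is a placeholder index name):

// Explain the unassigned primary of a placeholder index and log each decider's reasoning;
// a declined restore now carries one of the two messages added above.
var explainResponse = client().execute(
    TransportClusterAllocationExplainAction.TYPE,
    new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex("my-index").setShard(0).setPrimary(true)
).actionGet();
for (var nodeDecision : explainResponse.getExplanation().getShardAllocationDecision().getAllocateDecision().getNodeDecisions()) {
    for (var decision : nodeDecision.getCanAllocateDecision().getDecisions()) {
        logger.info("decider [{}]: {}", decision.label(), decision.getExplanation());
    }
}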

server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/RestoreInProgressAllocationDeciderTests.java

Lines changed: 35 additions & 17 deletions

@@ -40,7 +40,6 @@
 
 import static java.util.Collections.singletonList;
 import static org.elasticsearch.cluster.routing.RoutingNodesHelper.shardsWithState;
-import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.startsWith;
 
 /**
@@ -80,10 +79,9 @@ public void testCannotAllocatePrimaryMissingInRestoreInProgress() {
         assertEquals(Decision.Type.NO, decision.type());
         assertThat(
             decision.getExplanation(),
-            equalTo(
-                "shard has failed to be restored from the snapshot [default:_repository:_missing/_uuid] - manually close or "
-                    + "delete the index [test] in order to retry to restore the snapshot again or use the reroute API "
-                    + "to force the allocation of an empty primary shard. Details: [restore_source[_repository/_missing]]"
+            startsWith(
+                "Restore from snapshot failed because the configured constraints prevented allocation on any of the available nodes. "
+                    + "Please check constraints applied in index and cluster settings, then retry the restore."
             )
         );
     }
@@ -105,17 +103,28 @@ public void testCanAllocatePrimaryExistingInRestoreInProgress() {
         routingTable = clusterState.routingTable();
 
         final RestoreInProgress.State shardState;
+        final int failureCount;
         if (randomBoolean()) {
             shardState = randomFrom(RestoreInProgress.State.STARTED, RestoreInProgress.State.INIT);
+            failureCount = 0;
         } else {
             shardState = RestoreInProgress.State.FAILURE;
 
             UnassignedInfo currentInfo = primary.unassignedInfo();
+            UnassignedInfo.Reason reason;
+            if (randomBoolean()) {
+                failureCount = randomBoolean() ? 0 : 1;
+                reason = UnassignedInfo.Reason.ALLOCATION_FAILED;
+            } else {
+                failureCount = 0;
+                reason = currentInfo.reason();
+            }
+
             UnassignedInfo newInfo = new UnassignedInfo(
-                currentInfo.reason(),
+                reason,
                 currentInfo.message(),
                 new IOException("i/o failure"),
-                currentInfo.failedAllocations(),
+                failureCount,
                 currentInfo.unassignedTimeNanos(),
                 currentInfo.unassignedTimeMillis(),
                 currentInfo.delayed(),
@@ -165,16 +174,25 @@ public void testCanAllocatePrimaryExistingInRestoreInProgress() {
         Decision decision = executeAllocation(clusterState, primary);
         if (shardState == RestoreInProgress.State.FAILURE) {
             assertEquals(Decision.Type.NO, decision.type());
-            assertThat(
-                decision.getExplanation(),
-                startsWith(
-                    "shard has failed to be restored from the snapshot [default:_repository:_existing/_uuid]"
-                        + " - manually close or delete the index "
-                        + "[test] in order to retry to restore the snapshot again or use the reroute API to force the allocation of "
-                        + "an empty primary shard. Details: [restore_source[_repository/_existing], failure "
-                        + "java.io.IOException: i/o failure"
-                )
-            );
+            if (failureCount > 0) {
+                assertThat(
+                    decision.getExplanation(),
+                    startsWith(
+                        "shard has failed to be restored from the snapshot [default:_repository:_existing/_uuid]"
+                            + " - manually close or delete the index "
+                            + "[test] in order to retry to restore the snapshot again or use the reroute API to force the allocation of "
+                            + "an empty primary shard. Check the logs for more information about the failure. Details:"
+                    )
+                );
+            } else {
+                assertThat(
+                    decision.getExplanation(),
+                    startsWith(
+                        "Restore from snapshot failed because the configured constraints prevented allocation on any of the available "
+                            + "nodes. Please check constraints applied in index and cluster settings, then retry the restore."
+                    )
+                );
+            }
         } else {
             assertEquals(Decision.Type.YES, decision.type());
             assertEquals("shard is currently being restored", decision.getExplanation());