diff --git a/docs/reference/cluster/allocation-explain.asciidoc b/docs/reference/cluster/allocation-explain.asciidoc index 7547dd74c5ecd..99d1bb5480112 100644 --- a/docs/reference/cluster/allocation-explain.asciidoc +++ b/docs/reference/cluster/allocation-explain.asciidoc @@ -159,10 +159,11 @@ node. <5> The decider which led to the `no` decision for the node. <6> An explanation as to why the decider returned a `no` decision, with a helpful hint pointing to the setting that led to the decision. In this example, a newly created index has <> that requires that it only be allocated to a node named `nonexistent_node`, which does not exist, so the index is unable to allocate. +[[maximum-number-of-retries-exceeded]] ====== Maximum number of retries exceeded The following response contains an allocation explanation for an unassigned -primary shard that has reached the maximum number of allocation retry attempts. +primary shard that has reached the maximum number of allocation retry attempts. [source,js] ---- @@ -195,7 +196,7 @@ primary shard that has reached the maximum number of allocation retry attempts. { "decider": "max_retry", "decision" : "NO", - "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]" + "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed&metric=none] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]" } ] } @@ -203,9 +204,11 @@ primary shard that has reached the maximum number of allocation retry attempts. } ---- // NOTCONSOLE - -If decider message indicates a transient allocation issue, use -<> to retry allocation. +When Elasticsearch is unable to allocate a shard, it will attempt to retry allocation up to +the maximum number of retries allowed. After this, Elasticsearch will stop attempting to +allocate the shard in order to prevent infinite retries which may impact cluster +performance. Run the <> API to retry allocation, which +will allocate the shard if the issue preventing allocation has been resolved. ====== No valid shard copy @@ -334,7 +337,7 @@ queued to allocate but currently waiting on other queued shards. ---- // NOTCONSOLE -This is a transient message that might appear when a large amount of shards are allocating. +This is a transient message that might appear when a large amount of shards are allocating. ===== Assigned shard @@ -437,7 +440,7 @@ cluster balance. ===== No arguments If you call the API with no arguments, {es} retrieves an allocation explanation -for an arbitrary unassigned primary or replica shard, returning any unassigned primary shards first. +for an arbitrary unassigned primary or replica shard, returning any unassigned primary shards first. [source,console] ---- diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java index b20cd3ecaf992..cc3e186709514 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java @@ -14,6 +14,7 @@ import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.common.ReferenceDocs; import org.elasticsearch.common.settings.Setting; /** @@ -72,9 +73,11 @@ private static Decision debugDecision(Decision decision, UnassignedInfo info, in return Decision.single( Decision.Type.NO, NAME, - "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - manually call [%s] to retry, [%s]", + "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - " + + "manually call [%s] to retry, and for more information, see [%s] [%s]", maxRetries, RETRY_FAILED_API, + ReferenceDocs.ALLOCATION_EXPLAIN_MAX_RETRY, info.toString() ); } else { diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java index 987fa531baf8f..f3fe488555ef5 100644 --- a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java +++ b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java @@ -81,6 +81,7 @@ public enum ReferenceDocs { X_OPAQUE_ID, FORMING_SINGLE_NODE_CLUSTERS, JDK_LOCALE_DIFFERENCES, + ALLOCATION_EXPLAIN_MAX_RETRY, // this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner ; diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt index ee213c655a513..86a99bca5ee5a 100644 --- a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt +++ b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt @@ -43,3 +43,4 @@ FLOOD_STAGE_WATERMARK fix-watermark-er X_OPAQUE_ID api-conventions.html#x-opaque-id FORMING_SINGLE_NODE_CLUSTERS modules-discovery-bootstrap-cluster.html#modules-discovery-bootstrap-cluster-joining JDK_LOCALE_DIFFERENCES mapping-date-format.html#custom-date-format-locales +ALLOCATION_EXPLAIN_MAX_RETRY cluster-allocation-explain.html#maximum-number-of-retries-exceeded