Add link to MAX_RETRY allocation explain docs (elastic#113657)

matthewabbott · georgewallace · commit cc4a60418cf7 · 2024-10-24T20:40:58.000-06:00
diff --git a/docs/reference/cluster/allocation-explain.asciidoc b/docs/reference/cluster/allocation-explain.asciidoc
@@ -159,6 +159,7 @@ node.
 <5> The decider which led to the `no` decision for the node.
 <6> An explanation as to why the decider returned a `no` decision, with a helpful hint pointing to the setting that led to the decision. In this example, a newly created index has <<indices-get-settings,an index setting>> that requires that it only be allocated to a node named `nonexistent_node`, which does not exist, so the index is unable to allocate.
 
+[[maximum-number-of-retries-exceeded]]
 ====== Maximum number of retries exceeded
 
 The following response contains an allocation explanation for an unassigned
@@ -195,17 +196,19 @@ primary shard that has reached the maximum number of allocation retry attempts.
         {
           "decider": "max_retry",
           "decision" : "NO",
-          "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"
+          "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"
         }
       ]
     }
   ]
 }
 ----
 // NOTCONSOLE
-
-If decider message indicates a transient allocation issue, use
-the <<cluster-reroute,cluster reroute>> API to retry allocation.
+When Elasticsearch is unable to allocate a shard, it will attempt to retry allocation up to
+the maximum number of retries allowed. After this, Elasticsearch will stop attempting to
+allocate the shard in order to prevent infinite retries, which may impact cluster
+performance. Run the <<cluster-reroute,cluster reroute>> API to retry allocation, which
+will allocate the shard if the issue preventing allocation has been resolved.
 
 [[no-valid-shard-copy]]
 ====== No valid shard copy
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java
@@ -14,6 +14,7 @@
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.UnassignedInfo;
 import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.Setting;
 
 /**
@@ -72,9 +73,11 @@ private static Decision debugDecision(Decision decision, UnassignedInfo info, in
             return Decision.single(
                 Decision.Type.NO,
                 NAME,
-                "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - manually call [%s] to retry, [%s]",
+                "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - "
+                    + "manually call [%s] to retry, and for more information, see [%s] [%s]",
                 maxRetries,
                 RETRY_FAILED_API,
+                ReferenceDocs.ALLOCATION_EXPLAIN_MAX_RETRY,
                 info.toString()
             );
         } else {
diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
@@ -82,6 +82,7 @@ public enum ReferenceDocs {
     FORMING_SINGLE_NODE_CLUSTERS,
     CIRCUIT_BREAKER_ERRORS,
     ALLOCATION_EXPLAIN_NO_COPIES,
+    ALLOCATION_EXPLAIN_MAX_RETRY,
     // this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
     ;
 
diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt
@@ -44,3 +44,4 @@ X_OPAQUE_ID                                                     api-conventions.
 FORMING_SINGLE_NODE_CLUSTERS                                    modules-discovery-bootstrap-cluster.html#modules-discovery-bootstrap-cluster-joining
 CIRCUIT_BREAKER_ERRORS                                          circuit-breaker-errors.html
 ALLOCATION_EXPLAIN_NO_COPIES                                    cluster-allocation-explain.html#no-valid-shard-copy
+ALLOCATION_EXPLAIN_MAX_RETRY                                    cluster-allocation-explain.html#maximum-number-of-retries-exceeded

Original file line number	Diff line number	Diff line change
`@@ -159,6 +159,7 @@ node.`
`159`	`159`	<5> The decider which led to the `no` decision for the node.
`160`	`160`	<6> An explanation as to why the decider returned a `no` decision, with a helpful hint pointing to the setting that led to the decision. In this example, a newly created index has <<indices-get-settings,an index setting>> that requires that it only be allocated to a node named `nonexistent_node`, which does not exist, so the index is unable to allocate.
`161`	`161`
	`162`	`+[[maximum-number-of-retries-exceeded]]`
`162`	`163`	`====== Maximum number of retries exceeded`
`163`	`164`
`164`	`165`	`The following response contains an allocation explanation for an unassigned`
`@@ -195,17 +196,19 @@ primary shard that has reached the maximum number of allocation retry attempts.`
`195`	`196`	`{`
`196`	`197`	`"decider": "max_retry",`
`197`	`198`	`"decision" : "NO",`
`198`		`- "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"`
	`199`	`+ "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"`
`199`	`200`	`}`
`200`	`201`	`]`
`201`	`202`	`}`
`202`	`203`	`]`
`203`	`204`	`}`
`204`	`205`	`----`
`205`	`206`	`// NOTCONSOLE`
`206`		`-`
`207`		`-If decider message indicates a transient allocation issue, use`
`208`		`-the <<cluster-reroute,cluster reroute>> API to retry allocation.`
	`207`	`+When Elasticsearch is unable to allocate a shard, it will attempt to retry allocation up to`
	`208`	`+the maximum number of retries allowed. After this, Elasticsearch will stop attempting to`
	`209`	`+allocate the shard in order to prevent infinite retries, which may impact cluster`
	`210`	`+performance. Run the <<cluster-reroute,cluster reroute>> API to retry allocation, which`
	`211`	`+will allocate the shard if the issue preventing allocation has been resolved.`
`209`	`212`
`210`	`213`	`[[no-valid-shard-copy]]`
`211`	`214`	`====== No valid shard copy`
Original file line number	Diff line number	Diff line change
`@@ -82,6 +82,7 @@ public enum ReferenceDocs {`
`82`	`82`	`FORMING_SINGLE_NODE_CLUSTERS,`
`83`	`83`	`CIRCUIT_BREAKER_ERRORS,`
`84`	`84`	`ALLOCATION_EXPLAIN_NO_COPIES,`
	`85`	`+ ALLOCATION_EXPLAIN_MAX_RETRY,`
`85`	`86`	`// this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner`
`86`	`87`	`;`
`87`	`88`