From 4c5b15431044c7569a70934742c588249aa7ee70 Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:10:24 -0800 Subject: [PATCH 01/12] Added additional entries for troubleshooting unhealthy cluster Reordered "Re-enable shard allocation" because it is not as common as other causes Added additional causes of yellow statuses Changed watermark command to include high and low watermark so users can make their cluster operate once again. --- .../red-yellow-cluster-status.asciidoc | 64 ++++++++++++------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index eaa7cbc9e166c..83417ebbad334 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -74,35 +74,31 @@ A shard can become unassigned for several reasons. The following tips outline th most common causes and their solutions. [discrete] -[[fix-cluster-status-reenable-allocation]] -===== Re-enable shard allocation +[[fix-cluster-status-only-one-node]] +===== Single Node Cluster -You typically disable allocation during a <> or other -cluster maintenance. If you forgot to re-enable allocation afterward, {es} will -be unable to assign shards. To re-enable allocation, reset the -`cluster.routing.allocation.enable` cluster setting. +{es} will never assign a replica to the same node as the primary shard. If you only have one node it is expected for your cluster to indicate yellow. If you prefer it to be green, then change the <> on each index to be 0. 
-[source,console] ----- -PUT _cluster/settings -{ - "persistent" : { - "cluster.routing.allocation.enable" : null - } -} ----- - -See https://www.youtube.com/watch?v=MiKKUdZvwnI[this video] for walkthrough of troubleshooting "no allocations are allowed". +Similarly if the number of replicas is equal to or exceeds the number of nodes, then it will not be possible to allocate one or more of the shards for the same reason. [discrete] [[fix-cluster-status-recover-nodes]] ===== Recover lost nodes Shards often become unassigned when a data node leaves the cluster. This can -occur for several reasons, ranging from connectivity issues to hardware failure. +occur for several reasons. + +* If you manually restart a node, then it will temporarily cause an unhealthy cluster until the node has recovered. + +* If you have a node that is overloaded or has stopped operating for any reason, then it will temporarily cause an unhealthy cluster. Nodes may disconnect because of prolonged garbage collection (GC) pauses, which can result from "out of memory" errors or high memory usage due to intensive search operations. See <> for more JVM related issues. + +* If nodes cannot reliably communicate due to networking issues, they may lose contact with one another. This can cause shards to become out of sync. You can often identify this issue by checking the logs for repeated messages about nodes leaving and rejoining the cluster. + After you resolve the issue and recover the node, it will rejoin the cluster. {es} will then automatically allocate any unassigned shards. +You can monitor this process by <>. You will see that the number of unallocated shards progressively reduces until green status is reached. + To avoid wasting resources on temporary issues, {es} <> by one minute by default. If you've recovered a node and don’t want to wait for the delay period, you can call the <> or add a delete phase. 
If you no longer need to search the data, you @@ -215,11 +212,34 @@ watermark or set it to an explicit byte value. PUT _cluster/settings { "persistent": { - "cluster.routing.allocation.disk.watermark.low": "30gb" + "cluster.routing.allocation.disk.watermark.low": "90%", + "cluster.routing.allocation.disk.watermark.high": "95%" } } ---- // TEST[s/"30gb"/null/] +**It is important to note that this is usually a temporary solution and may cause instability if the disk space is not freed up.** + +[discrete] +[[fix-cluster-status-reenable-allocation]] +===== Re-enable shard allocation + +You typically disable allocation during a <> or other +cluster maintenance. If you forgot to re-enable allocation afterward, {es} will +be unable to assign shards. To re-enable allocation, reset the +`cluster.routing.allocation.enable` cluster setting. + +[source,console] +---- +PUT _cluster/settings +{ + "persistent" : { + "cluster.routing.allocation.enable" : null + } +} +---- + +See https://www.youtube.com/watch?v=MiKKUdZvwnI[this video] for walkthrough of troubleshooting "no allocations are allowed". [discrete] [[fix-cluster-status-jvm]] @@ -267,4 +287,4 @@ POST _cluster/reroute?metric=none // TEST[s/^/PUT my-index\n/] // TEST[catch:bad_request] -See https://www.youtube.com/watch?v=6OAg9IyXFO4[this video] for a walkthrough of troubleshooting `no_valid_shard_copy`. \ No newline at end of file +See https://www.youtube.com/watch?v=6OAg9IyXFO4[this video] for a walkthrough of troubleshooting `no_valid_shard_copy`. From 8db40fdd6b6dc84af74cabda985bd08d15247995 Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:02:22 -0800 Subject: [PATCH 02/12] Drive-by copyedit with suggestions for concision and some formatting fixes. 
Co-authored-by: Liam Thompson <32779855+leemthompo@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 83417ebbad334..54ecce2864279 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -77,7 +77,7 @@ most common causes and their solutions. [[fix-cluster-status-only-one-node]] ===== Single Node Cluster -{es} will never assign a replica to the same node as the primary shard. If you only have one node it is expected for your cluster to indicate yellow. If you prefer it to be green, then change the <> on each index to be 0. +{es} will never assign a replica to the same node as the primary shard. A single-node cluster will always have yellow status. To change to green, set <> to 0 for all indices. Similarly if the number of replicas is equal to or exceeds the number of nodes, then it will not be possible to allocate one or more of the shards for the same reason. From 0cbbeddb94c4b0aa36aedbf2f39599ee29e6db84 Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:03:03 -0800 Subject: [PATCH 03/12] Concision and some formatting fixes. 
Co-authored-by: Liam Thompson <32779855+leemthompo@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 54ecce2864279..f6aeb8eb6bbb6 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -79,7 +79,7 @@ most common causes and their solutions. {es} will never assign a replica to the same node as the primary shard. A single-node cluster will always have yellow status. To change to green, set <> to 0 for all indices. -Similarly if the number of replicas is equal to or exceeds the number of nodes, then it will not be possible to allocate one or more of the shards for the same reason. +Therefore, if the number of replicas equals or exceeds the number of nodes, some shards won't be allocated. 
[discrete] [[fix-cluster-status-recover-nodes]] From 766be14724d443523786cb0edebd3be065e02ec2 Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:03:31 -0800 Subject: [PATCH 04/12] Colon added Co-authored-by: Liam Thompson <32779855+leemthompo@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index f6aeb8eb6bbb6..5a65d3f590058 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -86,7 +86,7 @@ Therefore, if the number of replicas equals or exceeds the number of nodes, some ===== Recover lost nodes Shards often become unassigned when a data node leaves the cluster. This can -occur for several reasons. +occur for several reasons: * If you manually restart a node, then it will temporarily cause an unhealthy cluster until the node has recovered. 
From 059159b8333fb6f64262f362a4b6dd50e510cba8 Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:29:51 -0800 Subject: [PATCH 05/12] Update docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc Co-authored-by: shainaraskas <58563081+shainaraskas@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 5a65d3f590058..01ad12dc2ee32 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -75,7 +75,7 @@ most common causes and their solutions. [discrete] [[fix-cluster-status-only-one-node]] -===== Single Node Cluster +===== Single node cluster {es} will never assign a replica to the same node as the primary shard. A single-node cluster will always have yellow status. To change to green, set <> to 0 for all indices. 
From f7c80b0269642c94534373c5f71849e01d54326c Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:32:58 -0800 Subject: [PATCH 06/12] Title change Co-authored-by: Liam Thompson <32779855+leemthompo@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 01ad12dc2ee32..921385a2817fc 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -218,7 +218,10 @@ PUT _cluster/settings } ---- // TEST[s/"30gb"/null/] -**It is important to note that this is usually a temporary solution and may cause instability if the disk space is not freed up.** +[IMPORTANT] +==== +This is usually a temporary solution and may cause instability if disk space is not freed up. 
+==== [discrete] [[fix-cluster-status-reenable-allocation]] From de0e848d9411de621d658ce8bfb0636fe36664a7 Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:20:59 -0800 Subject: [PATCH 07/12] Update docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc Co-authored-by: Liam Thompson <32779855+leemthompo@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 921385a2817fc..7793762e08141 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -97,7 +97,7 @@ occur for several reasons: After you resolve the issue and recover the node, it will rejoin the cluster. {es} will then automatically allocate any unassigned shards. -You can monitor this process by <>. You will see that the number of unallocated shards progressively reduces until green status is reached. +You can monitor this process by <>. The number of unallocated shards should progressively decrease until green status is reached. To avoid wasting resources on temporary issues, {es} <> by one minute by default. 
If you've recovered a node and don’t want From 49dea5c52e79f0a37faa8d098e46980c0de73c05 Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:04:02 -0800 Subject: [PATCH 08/12] Spelling fix --- .../common-issues/red-yellow-cluster-status.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 7793762e08141..89a3fea36f8ce 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -77,7 +77,7 @@ most common causes and their solutions. [[fix-cluster-status-only-one-node]] ===== Single node cluster -{es} will never assign a replica to the same node as the primary shard. A single-node cluster will always have yellow status. To change to green, set <> to 0 for all indices. +{es} will never assign a replica to the same node as the primary shard. A single-node cluster will always have yellow status. To change to green, set <> to 0 for all indices. Therefore, if the number of replicas equals or exceeds the number of nodes, some shards won't be allocated. 
From f43d0695b20194abd2f1487bc6055414b5378a06 Mon Sep 17 00:00:00 2001 From: George Wallace Date: Tue, 14 Jan 2025 13:07:33 -0700 Subject: [PATCH 09/12] Update docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc Co-authored-by: Liam Thompson <32779855+leemthompo@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 89a3fea36f8ce..cc72cde64f760 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -88,11 +88,11 @@ Therefore, if the number of replicas equals or exceeds the number of nodes, some Shards often become unassigned when a data node leaves the cluster. This can occur for several reasons: -* If you manually restart a node, then it will temporarily cause an unhealthy cluster until the node has recovered. +* A manual node restart will cause a temporary unhealthy cluster state until the node recovers. -* If you have a node that is overloaded or has stopped operating for any reason, then it will temporarily cause an unhealthy cluster. Nodes may disconnect because of prolonged garbage collection (GC) pauses, which can result from "out of memory" errors or high memory usage due to intensive search operations. See <> for more JVM related issues. +* When a node becomes overloaded or fails, it can temporarily disrupt the cluster’s health, leading to an unhealthy state. Prolonged garbage collection (GC) pauses, caused by out-of-memory errors or high memory usage during intensive searches, can trigger this state. See <> for more JVM-related issues. -* If nodes cannot reliably communicate due to networking issues, they may lose contact with one another. 
This can cause shards to become out of sync. You can often identify this issue by checking the logs for repeated messages about nodes leaving and rejoining the cluster. +* Network issues can prevent reliable node communication, causing shards to become out of sync. Check the logs for repeated messages about nodes leaving and rejoining the cluster. After you resolve the issue and recover the node, it will rejoin the cluster. {es} will then automatically allocate any unassigned shards. From cddf124578e9aff536edc77a08f6704082af56f8 Mon Sep 17 00:00:00 2001 From: Kofi B <23384471+thekofimensah@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:41:01 -0800 Subject: [PATCH 10/12] Update docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc Co-authored-by: George Wallace --- .../common-issues/red-yellow-cluster-status.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index cc72cde64f760..1d4baccccc712 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -218,6 +218,7 @@ PUT _cluster/settings } ---- // TEST[s/"30gb"/null/] + [IMPORTANT] ==== This is usually a temporary solution and may cause instability if disk space is not freed up. 
From 2bdac0f6fc9a077223e662924e20e9cf42a1643a Mon Sep 17 00:00:00 2001 From: George Wallace Date: Wed, 15 Jan 2025 10:51:57 -0700 Subject: [PATCH 11/12] Update docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc Co-authored-by: Liam Thompson <32779855+leemthompo@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 1d4baccccc712..75200f5ae1aaa 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -217,7 +217,7 @@ PUT _cluster/settings } } ---- -// TEST[s/"30gb"/null/] +// TEST[s/"90%"/null/] [IMPORTANT] ==== From 2ba831a081fa71c2dcb48d4114edcd9a9b1f8e62 Mon Sep 17 00:00:00 2001 From: Kofi B Date: Wed, 15 Jan 2025 10:54:00 -0800 Subject: [PATCH 12/12] Update docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc Co-authored-by: Liam Thompson <32779855+leemthompo@users.noreply.github.com> --- .../common-issues/red-yellow-cluster-status.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc index 75200f5ae1aaa..79a541eba609e 100644 --- a/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc +++ b/docs/reference/troubleshooting/common-issues/red-yellow-cluster-status.asciidoc @@ -218,6 +218,7 @@ PUT _cluster/settings } ---- // TEST[s/"90%"/null/] +// TEST[s/"95%"/null/] [IMPORTANT] ====