From 9579cf7beb18051726b9019de1b9a713600786fe Mon Sep 17 00:00:00 2001 From: Marci W <333176+marciw@users.noreply.github.com> Date: Mon, 17 Feb 2025 16:25:45 -0500 Subject: [PATCH 1/4] clean it up --- .../ece-config-change-errors.md | 130 ------- .../cloud-enterprise/ece-deployment-no-op.md | 17 - .../ech-analyze_shards_with-api.md | 128 ------- .../ech-analyze_shards_with-kibana.md | 20 -- .../cloud-heroku/ech-config-change-errors.md | 139 -------- .../ech-cpu-usage-exceed-allowed-threshold.md | 82 ----- .../cloud-heroku/ech-deployment-no-op.md | 21 -- ...jvm-heap-usage-exceed-allowed-threshold.md | 63 ---- .../ech-multiple-node-deployment-disk-used.md | 58 --- .../ech-nodes-unavailable-missing.md | 30 -- ...remediate-issues-allocation-explain-API.md | 142 -------- .../ech-single-node-deployment-disk-used.md | 46 --- ...o_is_my_cluster_really_highly_available.md | 16 - ...r_response_times_suddenly_so_much_worse.md | 16 - .../echscenario_why_are_shards_unavailable.md | 29 -- .../echscenario_why_is_my_node_unavailable.md | 26 -- ..._why_is_performance_degrading_over_time.md | 14 - .../cloud/cloud/ec-config-change-errors.md | 139 -------- .../cloud/cloud/ec-deployment-no-op.md | 21 -- ...o_is_my_cluster_really_highly_available.md | 16 - ...r_response_times_suddenly_so_much_worse.md | 16 - .../ec-scenario_why_are_shards_unavailable.md | 319 ----------------- .../ec-scenario_why_is_my_node_unavailable.md | 309 ---------------- ..._why_is_performance_degrading_over_time.md | 14 - raw-migrated-files/toc.yml | 24 -- troubleshoot/monitoring/cloud.md | 5 + .../monitoring/cluster-response-time.md | 20 +- .../monitoring/deployment-health-warnings.md | 28 +- troubleshoot/monitoring/high-availability.md | 21 +- .../monitoring/high-memory-pressure.md | 53 ++- troubleshoot/monitoring/node-bootlooping.md | 145 ++++++-- troubleshoot/monitoring/node-moves-outages.md | 1 + troubleshoot/monitoring/performance.md | 18 +- troubleshoot/monitoring/unavailable-nodes.md | 330 +++++++++++++++++- troubleshoot/monitoring/unavailable-shards.md | 327 +++++++++++++++-- troubleshoot/toc.yml | 7 +- 36 files changed, 841 insertions(+), 1949 deletions(-) delete mode 100644 raw-migrated-files/cloud/cloud-enterprise/ece-config-change-errors.md delete mode 100644 raw-migrated-files/cloud/cloud-enterprise/ece-deployment-no-op.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-api.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-kibana.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-config-change-errors.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-cpu-usage-exceed-allowed-threshold.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-deployment-no-op.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-jvm-heap-usage-exceed-allowed-threshold.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-multiple-node-deployment-disk-used.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-nodes-unavailable-missing.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-remediate-issues-allocation-explain-API.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/ech-single-node-deployment-disk-used.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/echscenario_is_my_cluster_really_highly_available.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md delete mode 100644 
raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_shards_unavailable.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_my_node_unavailable.md delete mode 100644 raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_performance_degrading_over_time.md delete mode 100644 raw-migrated-files/cloud/cloud/ec-config-change-errors.md delete mode 100644 raw-migrated-files/cloud/cloud/ec-deployment-no-op.md delete mode 100644 raw-migrated-files/cloud/cloud/ec-scenario_is_my_cluster_really_highly_available.md delete mode 100644 raw-migrated-files/cloud/cloud/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md delete mode 100644 raw-migrated-files/cloud/cloud/ec-scenario_why_are_shards_unavailable.md delete mode 100644 raw-migrated-files/cloud/cloud/ec-scenario_why_is_my_node_unavailable.md delete mode 100644 raw-migrated-files/cloud/cloud/ec-scenario_why_is_performance_degrading_over_time.md diff --git a/raw-migrated-files/cloud/cloud-enterprise/ece-config-change-errors.md b/raw-migrated-files/cloud/cloud-enterprise/ece-config-change-errors.md deleted file mode 100644 index fed68c1a9a..0000000000 --- a/raw-migrated-files/cloud/cloud-enterprise/ece-config-change-errors.md +++ /dev/null @@ -1,130 +0,0 @@ -# How do I resolve node bootlooping? [ece-config-change-errors] - -When you attempt to apply a configuration change to a deployment, the attempt may fail with an error indicating that the change could not be applied, and deployment resources may be unable to restart. In some cases, bootlooping may result, where the deployment resources cycle through a continual reboot process. - -:::{image} ../../../images/cloud-enterprise-ec-ce-configuration-change-failure.png -:alt: A screen capture of the deployment page showing an error: Latest change to {{es}} configuration failed. -::: - -To confirm if your Elasticsearch cluster is bootlooping, you can check the most recent plan under your [Deployment Activity page](../../../deploy-manage/deploy/cloud-enterprise/keep-track-of-deployment-activity.md) for the error: - -```sh -Plan change failed: Some instances were unable to start properly. -``` - -If this occurs, correlating {{es}} logs should report: - -```sh -fatal exception while booting Elasticsearch -``` - -Following are some frequent causes of a failed configuration change: - -1. [Secure settings](../../../troubleshoot/monitoring/node-bootlooping.md#ece-config-change-errors-secure-settings) -2. [Expired custom plugins or bundles](../../../troubleshoot/monitoring/node-bootlooping.md#ece-config-change-errors-expired-bundle-extension) -3. [OOM errors](../../../troubleshoot/monitoring/node-bootlooping.md#ece-config-change-errors-oom-errors) -4. [Existing index](../../../troubleshoot/monitoring/node-bootlooping.md#ece-config-change-errors-existing-index) -5. [Insufficient Storage](../../../troubleshoot/monitoring/node-bootlooping.md#ece-config-change-errors-insufficient-storage) - -If you’re unable to remediate the failing plan’s root cause, you can attempt to reset the deployment to the latest successful {{es}} configuration by performing a [no-op plan](../../../troubleshoot/monitoring/deployment-health-warnings.md). For an example, see this [video walkthrough](https://www.youtube.com/watch?v=8MnXZ9egBbQ). - - -## Secure settings [ece-config-change-errors-secure-settings] - -The most frequent cause of a failed deployment configuration change is due to invalid or mislocated [secure settings](../../../deploy-manage/security/secure-settings.md). 
This can frequently be discovered by searching {{es}} logs for one of the following error messages: - -```sh -IllegalStateException: security initialization failed -java.lang.IllegalArgumentException: unknown secure setting -``` - -The keystore allows you to safely store sensitive settings, such as passwords, as a key/value pair. You can then access a secret value from a settings file by referencing its key. Importantly, not all settings can be stored in the keystore, and the keystore does not validate the settings that you add. Adding unsupported settings can cause {{es}} or other components to fail to restart. To check whether a setting is supported in the keystore, look for a "Secure" qualifier in the [lists of reloadable settings](../../../deploy-manage/security/secure-settings.md). - -The following sections detail some secure settings problems that can result in a configuration change error that can prevent a deployment from restarting. You might diagnose these plan failures via the logs or via their [related exit codes](https://www.elastic.co/guide/en/elasticsearch/reference/current/stopping-elasticsearch.html#fatal-errors) `1`, `3`, and `78`. - - -### Invalid or outdated values [ece-config-change-errors-old-values] - -The keystore does not validate any settings that you add, so invalid or outdated values are a common source of errors when you apply a configuration change to a deployment. - -To check the current set of stored settings: - -1. Open the deployment **Security** page. -2. In the **{{es}} keystore** section, check the **Security keys** list. The list is shown only if you currently have settings configured in the keystore. - -One frequent cause of errors is when settings in the keystore are no longer valid, such as when SAML settings are added for a test environment, but the settings are either not carried over or no longer valid in a production environment. - - -### Snapshot repositories [ece-config-change-errors-snapshot-repos] - -Sometimes, settings added to the keystore to connect to a snapshot repository may not be valid. When this happens, you may get an error such as `SettingsException[Neither a secret key nor a shared access token was set.]` - -For example, when adding an [Azure repository storage setting](../../../deploy-manage/tools/snapshot-and-restore/azure-repository.md#repository-azure-usage) such as `azure.client.default.account` to the keystore, the associated setting `azure.client.default.key` must also be added for the configuration to be valid. - - -### Third-party authentication [ece-config-change-errors-third-party-auth] - -When you configure third-party authentication, it’s important that all required configuration elements that are stored in the keystore are included in the {{es}} user settings file. For example, when you [create a SAML realm](../../../deploy-manage/users-roles/cluster-or-deployment-auth/saml.md#saml-create-realm), omitting a field such as `idp.entity_id` when that setting is present in the keystore results in a failed configuration change. - - -### Wrong location [ece-config-change-errors-wrong-location] - -In some cases, settings may accidentally be added to the keystore that should have been added to the [{{es}} user settings file](../../../deploy-manage/deploy/cloud-enterprise/edit-stack-settings.md). It’s always a good idea to check the [lists of reloadable settings](../../../deploy-manage/security/secure-settings.md) to determine if a setting can be stored in the keystore. 
Settings that can safely be added to the keystore are flagged as `Secure`. - - -## Expired custom plugins or bundles [ece-config-change-errors-expired-bundle-extension] - -During the process of applying a configuration change, Elastic Cloud Enterprise checks to determine if any [uploaded custom plugins or bundles](../../../solutions/search/full-text/search-with-synonyms.md) are expired. - -Problematic plugins produce oscillating {{es}} start-up logs like the following: - -```sh -Booting at Sun Sep 4 03:06:43 UTC 2022 -Installing user plugins. -Installing elasticsearch-analysis-izumo-master-7.10.2-20210618-28f8a97... -/app/elasticsearch.sh: line 169: [: too many arguments -Booting at Sun Sep 4 03:06:58 UTC 2022 -Installing user plugins. -Installing elasticsearch-analysis-izumo-master-7.10.2-20210618-28f8a97... -/app/elasticsearch.sh: line 169: [: too many arguments -``` - -Problematic bundles produce similar oscillations, but their install log looks like the following: - -```sh -2024-11-17 15:18:02 https://found-user-plugins.s3.amazonaws.com/XXXXX/XXXXX.zip?response-content-disposition=attachment%3Bfilename%XXXXX%2F4007535947.zip&x-elastic-extension-version=1574194077471&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20241016T133214Z&X-Amz-SignedHeaders=host&X-Amz-Expires=86400&XAmz-Credential=XXXXX%2F20201016%2Fus-east-1%2Fs3%2Faws4_request&X-AmzSignature=XXXXX -``` - -Note that in this example the bundle’s expiration (`X-Amz-Date=20241016T133214Z`) is earlier than the log timestamp (`2024-11-17 15:18:02`), so this bundle is considered expired. - -To view any added plugins or bundles: - -1. Go to the **Features** page and open the **Extensions** tab. -2. Select any extension and then choose **Update extension** to renew it. No other changes are needed, and any associated configuration change failures should now be able to succeed. - - -## OOM errors [ece-config-change-errors-oom-errors] - -Configuration change errors can occur when there is insufficient RAM configured for a data tier. In this case, the cluster typically also shows OOM (out of memory) errors. To resolve these, you need to increase the amount of heap memory, which is half of the amount of memory allocated to a cluster. You might also detect OOM in plan changes via their [related exit codes](https://www.elastic.co/guide/en/elasticsearch/reference/current/stopping-elasticsearch.html#fatal-errors) `127`, `137`, and `158`. - -You can also read our detailed blog [Managing and troubleshooting {{es}} memory](https://www.elastic.co/blog/managing-and-troubleshooting-elasticsearch-memory). - - -## Existing index [ece-config-change-errors-existing-index] - -In rare cases, when you attempt to upgrade the version of a deployment and the upgrade fails on the first attempt, subsequent attempts to upgrade may fail due to already existing resources. The problem may be due to the system preventing itself from overwriting existing indices, resulting in an error such as this: `Another Kibana instance appears to be migrating the index. Waiting for that migration to complete. If no other Kibana instance is attempting migrations, you can get past this message by deleting index .kibana_2 and restarting Kibana`. - -To resolve this: - -1. Check that you don’t need the content. -2. Run an {{es}} [Delete index request](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete) to remove the existing index.
- - In this example, the `.kibana_2` index is the rollover of saved objects (such as Kibana visualizations or dashboards) from the original `.kibana_1` index. Since `.kibana_2` was created as part of the failed upgrade process, this index does not yet contain any pertinent data and it can safely be deleted. - -3. Retry the deployment configuration change. - - -## Insufficient Storage [ece-config-change-errors-insufficient-storage] - -Configuration change errors can occur when there is insufficient disk space for a data tier. To resolve this, you need to increase the size of that tier to ensure it provides enough storage to accommodate the data in your cluster tier considering the [high watermark](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-cluster.html#disk-based-shard-allocation). For troubleshooting walkthrough, see [Fix watermark errors](https://www.elastic.co/guide/en/elasticsearch/reference/current/fix-watermark-errors.html). - diff --git a/raw-migrated-files/cloud/cloud-enterprise/ece-deployment-no-op.md b/raw-migrated-files/cloud/cloud-enterprise/ece-deployment-no-op.md deleted file mode 100644 index 84f1ece137..0000000000 --- a/raw-migrated-files/cloud/cloud-enterprise/ece-deployment-no-op.md +++ /dev/null @@ -1,17 +0,0 @@ -# How do I resolve deployment health warnings? [ece-deployment-no-op] - -The Elastic Cloud Enterprise **Deployments** page shows the current status of your active deployments. From time to time you may get one or more health warnings, such as the following: - -:::{image} ../../../images/cloud-enterprise-ec-ce-deployment-health-warning.png -:alt: A screen capture of the deployment page showing a typical warning: Deployment health warning: Latest change to {{es}} configuration failed. -::: - -**Seeing only one warning?** - -To resolve a single health warning, we recommended first re-applying any pending changes: Select **Edit** in the deployment menu to open the Edit page and then click **Save** without making any changes. This will check all components for pending changes and will apply the changes as needed. This may impact the uptime of clusters which are not [highly available](../../../deploy-manage/deploy/cloud-enterprise/ece-ha.md). - -Re-saving the deployment configuration without making any changes is often all that’s needed to resolve a transient health warning on the UI. Saving will redirect you to the Elastic Cloud Enterprise deployment [Activity page](../../../deploy-manage/deploy/cloud-enterprise/keep-track-of-deployment-activity.md) where you can monitor plan completion. Repeat errors should be investigated; for more information refer to [resolving configuration change errors](../../../troubleshoot/monitoring/node-bootlooping.md). - -**Seeing multiple warnings?** - -If multiple health warnings appear for one of your deployments, check the list of [Common issues](../../../troubleshoot/deployments/cloud-enterprise/common-issues.md), or [Ask for help](../../../troubleshoot/deployments/cloud-enterprise/ask-for-help.md) if you cannot resolve the problem yourself. 
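Before or after re-saving the configuration, it can help to confirm whether the warning reflects a problem inside the cluster itself or only a failed plan. As a rough check (run from Kibana Dev Tools or any client with access to the deployment; the `filter_path` parameter only trims the response), you can query the cluster health API:

```json
GET _cluster/health?filter_path=status,unassigned_shards,number_of_pending_tasks
```

A `green` status with no unassigned shards suggests the warning concerns the plan change itself rather than the data in the cluster, in which case re-applying the pending changes as described above is usually enough.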
diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-api.md b/raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-api.md deleted file mode 100644 index 3e6fc23c45..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-api.md +++ /dev/null @@ -1,128 +0,0 @@ -# Analyze unassigned shards using the {{es}} API [ech-analyze_shards_with-api] - -You can retrieve information about the status of your cluster, indices, and shards using the {{es}} API. To access the API you can either use the [Kibana Dev Tools Console](../../../explore-analyze/query-filter/tools/console.md), or the [Elasticsearch API console](../../../deploy-manage/deploy/elastic-cloud/ech-api-console.md). This section shows you how to: - -* [Check cluster health](ech-analyze_shards_with-api.md#ech-check-cluster-health) -* [Check unhealthy indices](ech-analyze_shards_with-api.md#ech-check-unhealthy-indices) -* [Check which shards are unassigned](ech-analyze_shards_with-api.md#ech-check-which-unassigned-shards) -* [Check why shards are unassigned](ech-analyze_shards_with-api.md#ech-check-why-unassigned-shards) -* [Check Elasticsearch cluster logs](ech-analyze_shards_with-api.md#ech-check-es-cluster-logs) - - -## Check cluster health [ech-check-cluster-health] - -Use the [Cluster health API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-health): - -```json -GET /_cluster/health/ -``` - -This command returns the cluster status (green, yellow, or red) and shows the number of unassigned shards: - -```json -{ - "cluster_name" : "xxx", - "status" : "red", - "timed_out" : false, - "number_of_nodes" : "x", - "number_of_data_nodes" : "x", - "active_primary_shards" : 116, - "active_shards" : 229, - "relocating_shards" : 0, - "initializing_shards" : 0, - "unassigned_shards" : 1, - "delayed_unassigned_shards" : 0, - "number_of_pending_tasks" : 0, - "number_of_inflight_fetch" : 0, - "task_max_waiting_in_queue_millis" : 0, - "active_shards_percent_as_number" : 98.70689655172413 -} -``` - - -## Check unhealthy indices [ech-check-unhealthy-indices] - -Use the [cat indices API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-indices) to get the status of individual indices. Specify the `health` parameter to limit the results to a particular status, for example `?v&health=red` or `?v&health=yellow`. - -```json -GET /_cat/indices?v&health=red -``` - -This command returns any indices that have unassigned primary shards (red status): - -```json -red open filebeat-7.10.0-2022.01.07-000014 C7N8fxGwRxK0JcwXH18zVg 1 1 -red open filebeat-7.9.3-2022.01.07-000015 Ib4UIJNVTtOg6ovzs011Lq 1 1 -``` - -For more information, refer to [Fix a red or yellow cluster status](../../../troubleshoot/elasticsearch/red-yellow-cluster-status.md#fix-red-yellow-cluster-status). 
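If you prefer to stay within the cluster health API instead of the cat APIs, you can request per-index detail directly. A minimal example (the `level` parameter accepts `cluster`, `indices`, or `shards`; `filter_path` is optional and only trims the output):

```json
GET /_cluster/health?level=indices&filter_path=indices.*.status,indices.*.unassigned_shards
```

This returns the health status and unassigned shard count for every index, which is convenient when you want machine-readable output rather than the tabular cat format.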
- - -## Check which shards are unassigned [ech-check-which-unassigned-shards] - -Use the [cat shards API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-shards): - -```json -GET /_cat/shards/?v -``` - -This command returns the index name, followed by the shard type and shard status: - -```json -filebeat-7.10.0-2022.01.07-000014 0 P UNASSIGNED -filebeat-7.9.3-2022.01.07-000015 1 P UNASSIGNED -filebeat-7.9.3-2022.01.07-000015 2 r UNASSIGNED -``` - - -## Check why shards are unassigned [ech-check-why-unassigned-shards] - -To understand why shards are unassigned, run the [Cluster allocation explain API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-allocation-explain). - -Running the API call `GET _cluster/allocation/explain` retrieves an allocation explanation for unassigned primary shards, or replica shards. - -For example, if `_cat/health` shows that the primary shard of shard 1 in the `filebeat-7.9.3-2022.01.07-000015` index is unassigned, you can get the allocation explanation with the following request: - -```json -GET _cluster/allocation/explain -{ - "index": "filebeat-7.9.3-2022.01.07-000015", - "shard": 1, - "primary": true -} -``` - -The response is as follows: - -```json -{ - "index": "filebeat-7.9.3-2022.01.07-000015", - "shard": 1, - "primary": true, - "current_state": "unassigned", - "unassigned_info": { - "reason": "CLUSTER_RECOVERED", - "at": "2022-04-12T13:06:36.125Z", - "last_allocation_status": "no_valid_shard_copy" - }, - "can_allocate": "no_valid_shard_copy", - "allocate_explanation": "cannot allocate because a previous copy of the primary shard existed but can no longer be found on the nodes in the cluster", - "node_allocation_decisions": [ - { - "node_id": "xxxx", - "node_name": "instance-0000000005", - (... skip ...) - "node_decision": "no", - "store": { - "found": false - } - } - ] -} -``` - - -## Check {{es}} cluster logs [ech-check-es-cluster-logs] - -To determine the allocation issue, you can [check the logs](ech-monitoring-setup.md#ech-check-logs). This is easier if you have set up a dedicated monitoring deployment. - diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-kibana.md b/raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-kibana.md deleted file mode 100644 index c82549c968..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-kibana.md +++ /dev/null @@ -1,20 +0,0 @@ -# Analyze unassigned shards using the Kibana UI [ech-analyze_shards_with-kibana] - -If you are shipping logs and metrics to a monitoring deployment, go through the following steps. - -1. Select your deployment from the {{es}} Service panel and navigate to the **Logs and metrics** page. -2. Click **Enable**. -3. Choose the deployment where to send your logs and metrics. -4. Click **Save**. It might take a few minutes to apply the configuration changes. -5. Click **View** to open the Kibana UI and get more details on metrics and logs. - -:::{image} ../../../images/cloud-heroku-ec-logs-metrics-page.png -:alt: Log and metrics page -::: - -The unhealthy indices appear with a red or yellow status. 
- -:::{image} ../../../images/cloud-heroku-ec-red-yellow-indices.png -:alt: Unhealthy indices in red or yellow status -::: - diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-config-change-errors.md b/raw-migrated-files/cloud/cloud-heroku/ech-config-change-errors.md deleted file mode 100644 index 4c17bdb81c..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-config-change-errors.md +++ /dev/null @@ -1,139 +0,0 @@ -# How do I resolve node bootlooping? [ech-config-change-errors] - -When you attempt to apply a configuration change to a deployment, the attempt may fail with an error indicating that the change could not be applied, and deployment resources may be unable to restart. In some cases, bootlooping may result, where the deployment resources cycle through a continual reboot process. - -:::{image} ../../../images/cloud-heroku-ec-ce-configuration-change-failure.png -:alt: A screen capture of the deployment page showing an error: Latest change to {{es}} configuration failed. -::: - -To help diagnose these and any other types of issues in your deployments, we recommend [setting up monitoring](../../../deploy-manage/monitor/stack-monitoring/elastic-cloud-stack-monitoring.md). Then, you can easily view your deployment health and access log files to troubleshoot this configuration failure. - -To confirm if your Elasticsearch cluster is bootlooping, you can check the most recent plan under your [Deployment Activity page](../../../deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) for the error: - -```sh -Plan change failed: Some instances were unable to start properly. -``` - -If this occurs, correlating {{es}} logs should report: - -```sh -fatal exception while booting Elasticsearch -``` - -Following are some frequent causes of a failed configuration change: - -1. [Secure settings](../../../troubleshoot/monitoring/node-bootlooping.md#ech-config-change-errors-secure-settings) -2. [Expired custom plugins or bundles](../../../troubleshoot/monitoring/node-bootlooping.md#ech-config-change-errors-expired-bundle-extension) -3. [OOM errors](../../../troubleshoot/monitoring/node-bootlooping.md#ech-config-change-errors-oom-errors) -4. [Existing index](../../../troubleshoot/monitoring/node-bootlooping.md#ech-config-change-errors-existing-index) -5. [Insufficient Storage](../../../troubleshoot/monitoring/node-bootlooping.md#ech-config-change-errors-insufficient-storage) - -If you’re unable to remediate the failing plan’s root cause, you can attempt to reset the deployment to the latest successful {{es}} configuration by performing a [no-op plan](../../../troubleshoot/monitoring/deployment-health-warnings.md). For an example, see this [video walkthrough](https://www.youtube.com/watch?v=8MnXZ9egBbQ). - - -## Secure settings [ech-config-change-errors-secure-settings] - -The most frequent cause of a failed deployment configuration change is due to invalid or mislocated [secure settings](../../../deploy-manage/security/secure-settings.md). This can frequently be discovered by searching {{es}} logs for one of the following error messages: - -```sh -IllegalStateException: security initialization failed -java.lang.IllegalArgumentException: unknown secure setting -``` - -These are settings typically added to the keystore for the purpose of: - -1. 
Setting up third-party authentication, for example [SAML](../../../deploy-manage/users-roles/cluster-or-deployment-auth/saml.md), [OpenID Connect](../../../deploy-manage/users-roles/cluster-or-deployment-auth/openid-connect.md), or [Kerberos](../../../deploy-manage/users-roles/cluster-or-deployment-auth/kerberos.md). -2. Setting up a [custom repository](../../../deploy-manage/tools/snapshot-and-restore/elastic-cloud-hosted.md). - -The keystore allows you to safely store sensitive settings, such as passwords, as a key/value pair. You can then access a secret value from a settings file by referencing its key. Importantly, not all settings can be stored in the keystore, and the keystore does not validate the settings that you add. Adding unsupported settings can cause {{es}} or other components to fail to restart. To check whether a setting is supported in the keystore, look for a "Secure" qualifier in the [lists of reloadable settings](../../../deploy-manage/security/secure-settings.md). - -The following sections detail some secure settings problems that can result in a configuration change error that can prevent a deployment from restarting. You might diagnose these plan failures via the logs or via their [related exit codes](https://www.elastic.co/guide/en/elasticsearch/reference/current/stopping-elasticsearch.html#fatal-errors) `1`, `3`, and `78`. - - -### Invalid or outdated values [ech-config-change-errors-old-values] - -The keystore does not validate any settings that you add, so invalid or outdated values are a common source of errors when you apply a configuration change to a deployment. - -To check the current set of stored settings: - -1. Open the deployment **Security** page. -2. In the **{{es}} keystore** section, check the **Security keys** list. The list is shown only if you currently have settings configured in the keystore. - -One frequent cause of errors is when settings in the keystore are no longer valid, such as when SAML settings are added for a test environment, but the settings are either not carried over or no longer valid in a production environment. - - -### Snapshot repositories [ech-config-change-errors-snapshot-repos] - -Sometimes, settings added to the keystore to connect to a snapshot repository may not be valid. When this happens, you may get an error such as `SettingsException[Neither a secret key nor a shared access token was set.]` - -For example, when adding an [Azure repository storage setting](../../../deploy-manage/tools/snapshot-and-restore/azure-repository.md#repository-azure-usage) such as `azure.client.default.account` to the keystore, the associated setting `azure.client.default.key` must also be added for the configuration to be valid. - - -### Third-party authentication [ech-config-change-errors-third-party-auth] - -When you configure third-party authentication, it’s important that all required configuration elements that are stored in the keystore are included in the {{es}} user settings file. For example, when you [create a SAML realm](../../../deploy-manage/users-roles/cluster-or-deployment-auth/saml.md#saml-create-realm), omitting a field such as `idp.entity_id` when that setting is present in the keystore results in a failed configuration change. - - -### Wrong location [ech-config-change-errors-wrong-location] - -In some cases, settings may accidentally be added to the keystore that should have been added to the [{{es}} user settings file](../../../deploy-manage/deploy/elastic-cloud/edit-stack-settings.md). 
It’s always a good idea to check the [lists of reloadable settings](../../../deploy-manage/security/secure-settings.md) to determine if a setting can be stored in the keystore. Settings that can safely be added to the keystore are flagged as `Secure`. - - -## Expired custom plugins or bundles [ech-config-change-errors-expired-bundle-extension] - -During the process of applying a configuration change, Elasticsearch Add-On for Heroku checks to determine if any [uploaded custom plugins or bundles](../../../deploy-manage/deploy/elastic-cloud/upload-custom-plugins-bundles.md) are expired. - -Problematic plugins produce oscillating {{es}} start-up logs like the following: - -```sh -Booting at Sun Sep 4 03:06:43 UTC 2022 -Installing user plugins. -Installing elasticsearch-analysis-izumo-master-7.10.2-20210618-28f8a97... -/app/elasticsearch.sh: line 169: [: too many arguments -Booting at Sun Sep 4 03:06:58 UTC 2022 -Installing user plugins. -Installing elasticsearch-analysis-izumo-master-7.10.2-20210618-28f8a97... -/app/elasticsearch.sh: line 169: [: too many arguments -``` - -Problematic bundles produce similar oscillations but their install log would appear like - -```sh -2024-11-17 15:18:02 https://found-user-plugins.s3.amazonaws.com/XXXXX/XXXXX.zip?response-content-disposition=attachment%3Bfilename%XXXXX%2F4007535947.zip&x-elastic-extension-version=1574194077471&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20241016T133214Z&X-Amz-SignedHeaders=host&X-Amz-Expires=86400&XAmz-Credential=XXXXX%2F20201016%2Fus-east-1%2Fs3%2Faws4_request&X-AmzSignature=XXXXX -``` - -Noting in example that the bundle’s expiration `X-Amz-Date=20241016T133214Z` is before than the log timestamp `2024-11-17 15:18:02` so this bundle is considered expired. - -To view any added plugins or bundles: - -1. Go to the **Features** page and open the **Extensions** tab. -2. Select any extension and then choose **Update extension** to renew it. No other changes are needed, and any associated configuration change failures should now be able to succeed. - - -## OOM errors [ech-config-change-errors-oom-errors] - -Configuration change errors can occur when there is insufficient RAM configured for a data tier. In this case, the cluster typically also shows OOM (out of memory) errors. To resolve these, you need to increase the amount of heap memory, which is half of the amount of memory allocated to a cluster. You might also detect OOM in plan changes via their [related exit codes](https://www.elastic.co/guide/en/elasticsearch/reference/current/stopping-elasticsearch.html#fatal-errors) `127`, `137`, and `158`. - -Check the [{{es}} cluster size](../../../deploy-manage/deploy/elastic-cloud/ech-customize-deployment-components.md#ech-cluster-size) and the [JVM memory pressure indicator](/deploy-manage/monitor/monitoring-data/ec-memory-pressure.md) documentation to learn more. - -As well, you can read our detailed blog [Managing and troubleshooting {{es}} memory](https://www.elastic.co/blog/managing-and-troubleshooting-elasticsearch-memory). - - -## Existing index [ech-config-change-errors-existing-index] - -In rare cases, when you attempt to upgrade the version of a deployment and the upgrade fails on the first attempt, subsequent attempts to upgrade may fail due to already existing resources. The problem may be due to the system preventing itself from overwriting existing indices, resulting in an error such as this: `Another Kibana instance appears to be migrating the index. Waiting for that migration to complete. 
If no other Kibana instance is attempting migrations, you can get past this message by deleting index .kibana_2 and restarting Kibana`. - -To resolve this: - -1. Check that you don’t need the content. -2. Run an {{es}} [Delete index request](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete) to remove the existing index. - - In this example, the `.kibana_2` index is the rollover of saved objects (such as Kibana visualizations or dashboards) from the original `.kibana_1` index. Since `.kibana_2` was created as part of the failed upgrade process, this index does not yet contain any pertinent data and it can safely be deleted. - -3. Retry the deployment configuration change. - - -## Insufficient Storage [ech-config-change-errors-insufficient-storage] - -Configuration change errors can occur when there is insufficient disk space for a data tier. To resolve this, you need to increase the size of that tier to ensure it provides enough storage to accommodate the data in your cluster tier considering the [high watermark](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-cluster.html#disk-based-shard-allocation). For a troubleshooting walkthrough, see [Fix watermark errors](https://www.elastic.co/guide/en/elasticsearch/reference/current/fix-watermark-errors.html). - diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-cpu-usage-exceed-allowed-threshold.md b/raw-migrated-files/cloud/cloud-heroku/ech-cpu-usage-exceed-allowed-threshold.md deleted file mode 100644 index 9c3d0a5416..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-cpu-usage-exceed-allowed-threshold.md +++ /dev/null @@ -1,82 +0,0 @@ -# CPU usage exceeds the allowed threshold on master nodes [ech-cpu-usage-exceed-allowed-threshold] - -**Health check** - -By default, the allowed CPU usage threshold is set at 85%. - -1. Log in to the [Elasticsearch Add-On for Heroku console](https://cloud.elastic.co?page=docs&placement=docs-body). -2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. - - :::{image} ../../../images/cloud-heroku-ec-quick-link-to-deployment.png - :alt: Quick link to the deployment page - ::: - -3. Identify the IDs of your master nodes. On your deployment page, scroll down to **Instances** and filter your instance configuration by master. The IDs of your master nodes are in the title. In this example, the IDs are 21, 26 and 27: - - :::{image} ../../../images/cloud-heroku-ec-instances-filtered-by-master-id.png - :alt: Instances configuration filtered by master nodes ID - ::: - - ::::{note} - The name of the instance configuration might differ depending on the cloud provider. - :::: - -4. Navigate to the **Performance** page of your deployment. Check if the CPU usage of your master nodes exceeds 85%. Your master node has the format `instance-<ID>`, where `<ID>` is the ID of the master node. - -If you use [Stack Monitoring](https://www.elastic.co/guide/en/kibana/current/xpack-monitoring.html), open Kibana from your deployment page and select **Stack Monitoring** from the menu or the search bar. - -::::{note} -Stack Monitoring comes with out-of-the-box rules, but you need to enable them when prompted. -:::: - - -**Possible causes** - -* The master node is overwhelmed by a large number of snapshots or shards.
-* The memory available on the master node is overwhelmed by these tasks: - - * External tasks initiated by clients - - * Index, search, update - * Frequent template updates due to the Beats configuration - - * Internal tasks initiated by users - - * Machine Learning jobs, watches, monitoring, ingest pipelines - - * Internal tasks initiated by {{es}} - - * Nodes joining and leaving due to hardware failures - * Shard allocation due to nodes joining and leaving - * Configuration of ILM policies. - - -**Resolutions** - -* Navigate to the **Edit** page of your deployment and increase the master node size. -* [Upgrade the cluster](../../../deploy-manage/upgrade/deployment-or-cluster.md) to the latest version. -* If the master node is overwhelmed by external tasks initiated by clients: - - * Reduce the request rate or pause ingesting, searching, or updating from the client. - * Enable ingest and search-based autoscaling. - * Stop Beats to avoid frequent template updates. - -* If the master node is overwhelmed by internal tasks initiated by users: - - * Check [cluster-level pending tasks](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-pending-tasks). - * Reduce the number of Machine Learning jobs or watches. - * Change the number of ingest pipelines or processors to use less memory. - -* If the master node is overwhelmed by internal tasks initiated by {{es}}: - - * For nodes joining and leaving, this should resolve itself. If increasing the master nodes size doesn’t resolve the issue, contact support. - * For shard allocation, inspect the progress of shards recovery. If there’s no progress, contact support. - - * Make sure `indices.recovery.max_concurrent_operations` is not aggressive, which could cause the master to be unavailable. - * Make sure `indices.recovery.max_bytes_per_sec` is set adequately to avoid impact on ingest and search workload. - - * Check ILM policies to avoid index rollover and relocate actions that are concurrent and aggressive. - -* If the master node is overwhelmed by a large number of snapshots, reduce the number of snapshots in the repo. -* If the master node is overwhelmed by a large number of shards, reduce the number of shards on the node. For more information, check [Size your shards](../../../deploy-manage/production-guidance/optimize-performance/size-shards.md). - diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-deployment-no-op.md b/raw-migrated-files/cloud/cloud-heroku/ech-deployment-no-op.md deleted file mode 100644 index 02d5530e5c..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-deployment-no-op.md +++ /dev/null @@ -1,21 +0,0 @@ -# How do I resolve deployment health warnings? [ech-deployment-no-op] - -The Elasticsearch Add-On for Heroku [Deployments](https://cloud.elastic.co/deployments) page shows the current status of your active deployments. From time to time you may get one or more health warnings, such as the following: - -:::{image} ../../../images/cloud-heroku-ec-ce-deployment-health-warning.png -:alt: A screen capture of the deployment page showing a typical warning: Deployment health warning: Latest change to {{es}} configuration failed. -::: - -**Seeing only one warning?** - -To resolve a single health warning, we recommended first re-applying any pending changes: Select **Edit** in the deployment menu to open the Edit page and then click **Save** without making any changes. This will check all components for pending changes and will apply the changes as needed. 
This may impact the uptime of clusters which are not [highly available](../../../deploy-manage/production-guidance/plan-for-production-elastic-cloud.md#ech-ha). - -Re-saving the deployment configuration without making any changes is often all that’s needed to resolve a transient health warning on the UI. Saving will redirect you to the Elasticsearch Add-On for Heroku deployment [Activity page](../../../deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) where you can monitor plan completion. Repeat errors should be investigated; for more information refer to [resolving configuration change errors](../../../troubleshoot/monitoring/node-bootlooping.md). - -**Seeing multiple warnings?** - -If multiple health warnings appear for one of your deployments, or if your deployment is unhealthy, we recommend [Getting help](../../../deploy-manage/deploy/elastic-cloud/ech-get-help.md) through the Elastic Support Portal. - -**Warning about system changes** - -If the warning refers to a system change, check the deployment’s [Activity](../../../deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) page. diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-jvm-heap-usage-exceed-allowed-threshold.md b/raw-migrated-files/cloud/cloud-heroku/ech-jvm-heap-usage-exceed-allowed-threshold.md deleted file mode 100644 index ca4c39c7da..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-jvm-heap-usage-exceed-allowed-threshold.md +++ /dev/null @@ -1,63 +0,0 @@ -# JVM heap usage exceeds the allowed threshold on master nodes [ech-jvm-heap-usage-exceed-allowed-threshold] - -**Health check** - -1. Log in to the [Elasticsearch Add-On for Heroku console](https://cloud.elastic.co?page=docs&placement=docs-body). -2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. - - :::{image} ../../../images/cloud-heroku-ec-quick-link-to-deployment.png - :alt: Quick link to the deployment page - ::: - -3. On your deployment page, scroll down to **Instances** and check if the JVM memory pressure for your {{es}} instances is high. - - :::{image} ../../../images/cloud-heroku-ec-deployment-instances-config.png - :alt: Deployment instances configuration - ::: - - -**Possible causes** - -* The master node is overwhelmed by a large number of snapshots or shards. - - * External tasks initiated by clients - - * Index, search, update - * Frequent template updates due to the Beats configuration - - * Internal tasks initiated by users - - * Machine Learning jobs, watches, monitoring, ingest pipeline - - * Internal tasks initiated by {{es}} - - * Nodes joining and leaving due to hardware failures - * Shard allocation due to nodes joining and leaving - * Configuration of ILM policies. - - -**Resolutions** - -* If the master node is overwhelmed by external tasks initiated by clients: - - Investigate which clients might be overwhelming the cluster and reduce the request rate or pause ingesting, searching, or updating from the client. If you are using Beats, temporarily stop the Beat that’s overwhelming the cluster to avoid frequent template updates. - -* If the master node is overwhelmed by internal tasks initiated by users: - - * Check [cluster-level pending tasks](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-pending-tasks). - * Reduce the number of Machine Learning jobs or watches. - * Change the number of ingest pipelines or processors to use less memory. 
- -* If the master node is overwhelmed by internal tasks initiated by {{es}}: - - * For nodes joining and leaving, this should resolve itself. If increasing the master nodes size doesn’t resolve the issue, contact support. - * For shard allocation, inspect the progress of shards recovery. - - * Make sure `indices.recovery.max_concurrent_operations` is not aggressive, which could cause the master to be unavailable. - * Make sure `indices.recovery.max_bytes_per_sec` is set adequately to avoid impact on ingest and search workload. - - * Check ILM policies to avoid index rollover and relocate actions that are concurrent and aggressive. - -* If the master node is overwhelmed by a large number of snapshots, reduce the number of snapshots in the repo. -* If the master node is overwhelmed by a large number of shards, delete unneeded indices and shrink read-only indices to fewer shards. For more information, check [Reduce a cluster’s shard count](../../../deploy-manage/production-guidance/optimize-performance/size-shards.md#reduce-cluster-shard-count). - diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-multiple-node-deployment-disk-used.md b/raw-migrated-files/cloud/cloud-heroku/ech-multiple-node-deployment-disk-used.md deleted file mode 100644 index 75742ee36e..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-multiple-node-deployment-disk-used.md +++ /dev/null @@ -1,58 +0,0 @@ -# Full disk on multiple-nodes deployment [ech-multiple-node-deployment-disk-used] - -**Health check** - -1. Log in to the [Elasticsearch Add-On for Heroku console](https://cloud.elastic.co?page=docs&placement=docs-body). -2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. - - :::{image} ../../../images/cloud-heroku-ec-quick-link-to-deployment.png - :alt: Quick link to the deployment page - ::: - -3. On your deployment page, scroll down to **Instances** and check if the disk allocation for any of your {{es}} instances is over 90%. - - :::{image} ../../../images/cloud-heroku-ec-full-disk-multiple-nodes.png - :alt: Full disk on multiple-nodes deployment - ::: - - -**Possible cause** - -* The available storage is insufficient for the amount of ingested data. - -**Resolution** - -* [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). -* Increase the disk size (scale up). - -::::{note} -If your {{es}} cluster is unhealthy and reports a status of red, the scale up configuration change to increasing disk size on the affected data tiers may fail. You might need to delete some data so the configuration can be edited. If you want to increase your disk size without deleting data, then [reach out to Elastic support](../../../deploy-manage/deploy/elastic-cloud/ech-get-help.md) and we will assist you with scaling up. -:::: - - -**Preventions** - -* Increase the disk size (scale up). - - 1. On your deployment page, scroll down to **Instances** and identify the node attribute of the instances that are running out of disk space. - - :::{image} ../../../images/cloud-heroku-ec-node-attribute.png - :alt: Instance node attribute - ::: - - 2. Use the node types identified at step 1 to find out the corresponding data tier. - - :::{image} ../../../images/cloud-heroku-ec-node-types-data-tiers.png - :alt: Node type and corresponding attribute - ::: - - 3. From your deployment menu, go to the **Edit** page and increase the **Size per zone** for the data tiers identified at step 2. 
- - :::{image} ../../../images/cloud-heroku-ec-increase-size-per-zone.png - :alt: Increase size per zone - ::: - -* Enable [autoscaling](../../../deploy-manage/autoscaling.md) to grow your cluster automatically when it runs out of space. -* Configure (ILM) policies to automatically delete unused data. -* Enable [data tiers](../../../manage-data/lifecycle/data-tiers.md) to move older data that you don’t query often to more cost-effective storage. - diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-nodes-unavailable-missing.md b/raw-migrated-files/cloud/cloud-heroku/ech-nodes-unavailable-missing.md deleted file mode 100644 index e10bee7072..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-nodes-unavailable-missing.md +++ /dev/null @@ -1,30 +0,0 @@ -# Some nodes are unavailable and are displayed as missing [ech-nodes-unavailable-missing] - -**Health check** - -* Use the [Metrics inventory](https://www.elastic.co/guide/en/observability/current/monitor-infrastructure-and-hosts.html) to identify unavailable or unhealthy nodes. If the number of minimum master nodes is down, {{es}} is not available. - -**Possible causes** - -* Hardware issue. -* Routing has stopped because of a previous ES configuration failure. -* Disk/memory/CPU are saturated. -* The network is saturated or disconnected. -* Nodes are unable to join. - -**Resolutions** - -* Hardware issue: Any unhealthy hardware detected by the platform is automatically vacated within the hour. If this doesn’t happen, contact support. -* Routing stopped: A failed {{es}} configuration might stop the nodes routing. Restart the routing manually to bring the node back to health. -* Disk/memory/CPU saturated: - - * [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). - * Increase disk size. - * [Enable autoscaling](../../../deploy-manage/autoscaling.md). - * Configuration of ILM policies. - * [Manage data tiers](../../../manage-data/lifecycle/data-tiers.md). - -* Network saturated or disconnected: Contact support. -* Nodes unable to join: Fix the {{es}} configuration. -* Nodes unable to join: Contact support. - diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-remediate-issues-allocation-explain-API.md b/raw-migrated-files/cloud/cloud-heroku/ech-remediate-issues-allocation-explain-API.md deleted file mode 100644 index 56582c692e..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-remediate-issues-allocation-explain-API.md +++ /dev/null @@ -1,142 +0,0 @@ -# Remediate common issues returned by the cluster allocation explain API [ech-remediate-issues-allocation-explain-API] - -Here’s how to resolve the most common causes of unassigned shards reported by the cluster allocation explain API. 
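Each of the issues below is identified from the output of that API. If you have not yet captured the explanation for the problem shard, a typical request looks like the following (the index name and shard number are placeholders; you can also send the request with no body to get an explanation for an arbitrary unassigned shard):

```json
GET _cluster/allocation/explain
{
  "index": "my-index",
  "shard": 0,
  "primary": true
}
```

The messages quoted in the sections below typically appear in the `allocate_explanation` and `node_allocation_decisions` fields of this response.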
- -* [Disk is full](../../../troubleshoot/monitoring/unavailable-shards.md#ech-disk-full) -* [A node containing data has moved to a different host](../../../troubleshoot/monitoring/unavailable-shards.md#ech-node-moved-to-another-host) -* [Unable to assign shards based on the allocation rule](../../../troubleshoot/monitoring/unavailable-shards.md#ech-cannot-assign-shards-on-allocation-rule) -* [The number of eligible data nodes is less than the number of replicas](../../../troubleshoot/monitoring/unavailable-shards.md#ech-eligible-data-nodes-less-than-replicas) -* [A snapshot issue prevents searchable snapshot indices from being allocated](../../../troubleshoot/monitoring/unavailable-shards.md#ech-searchable-snapshot-indices-not-allocated) -* [Maximum retry times exceeded](../../../troubleshoot/monitoring/unavailable-shards.md#ech-max-retry-exceeded) -* [Max shard per node reached the limit](../../../troubleshoot/monitoring/unavailable-shards.md#ech-max-shard-per-node) - -If your issue is not addressed here, then [contact Elastic support for help](../../../deploy-manage/deploy/elastic-cloud/ech-get-help.md). - -## Disk is full [ech-disk-full] - -**Symptom** - -If the disk usage exceeded the threshold, you may get one or more of the following messages: - -`the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=90%], using more disk space than the maximum allowed [90.0%], actual free: [9.273781776428223%]` - -`unable to force allocate shard to [%s] during replacement, as allocating to this node would cause disk usage to exceed 100%% ([%s] bytes above available disk space)` - -`the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=85%], using more disk space than the maximum allowed [85.0%], actual free: [14.119771122932434%]` - -`after allocating [[restored-xxx][0], node[null], [P], recovery_source[snapshot recovery [Om66xSJqTw2raoNyKxsNWg] from xxx/W5Yea4QuR2yyZ4iM44fumg], s[UNASSIGNED], unassigned_info[[reason=NEW_INDEX_RESTORED], at[2022-03-02T10:56:58.210Z], delayed=false, details[restore_source[xxx]], allocation_status[fetching_shard_data]]] node [GTXrECDRRmGkkAnB48hPqw] would have more than the allowed 10% free disk threshold (8.7% free), preventing allocation` - -**Resolutions** - -Review the topic for your deployment architecture: - -* [Full disk on single-node deployment](../../../troubleshoot/monitoring/unavailable-nodes.md) -* [Full disk on multiple-nodes deployment](../../../troubleshoot/monitoring/unavailable-nodes.md) - -To learn more, review the following topics: - -* [Cluster-level shard allocation and routing settings](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-cluster.html) -* [Fix watermark errors](../../../troubleshoot/elasticsearch/fix-watermark-errors.md) - - -## A node containing data has moved to a different host [ech-node-moved-to-another-host] - -**Symptom** - -During the routine system maintenance performed by Elastic, it might happen that a node moves to a different host. If the indices are not configured with replica shards, the shard data on the {{es}} node that is moved will be lost, and you might get one or more of these messages: - -`cannot allocate because a previous copy of the primary shard existed but can no longer be found on the nodes in the cluster` - -**Resolutions** - -Configure an [highly available cluster](../../../deploy-manage/production-guidance/plan-for-production-elastic-cloud.md) to keep your service running. 
Also, consider taking the following actions to bring your deployment back to health and recover your data from the snapshot. - -* [Close the red indices](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-close) -* [Restore the indices](../../../deploy-manage/tools/snapshot-and-restore.md) from the last successful snapshot - -For more information, check also [Designing for resilience](../../../deploy-manage/production-guidance/availability-and-resilience.md). - - -## Unable to assign shards based on the allocation rule [ech-cannot-assign-shards-on-allocation-rule] - -**Symptom** - -When shards cannot be assigned, due to [data tier allocation](../../../manage-data/lifecycle/data-tiers.md#data-tier-allocation) or [attribute-based allocation](../../../deploy-manage/distributed-architecture/shard-allocation-relocation-recovery/index-level-shard-allocation.md), you might get one or more of these messages: - -`node does not match index setting [index.routing.allocation.include] filters [node_type:\"cold\"]` - -`index has a preference for tiers [data_cold] and node does not meet the required [data_cold] tier` - -`index has a preference for tiers [data_cold,data_warm,data_hot] and node does not meet the required [data_cold] tier` - -`index has a preference for tiers [data_warm,data_hot] and node does not meet the required [data_warm] tier` - -`this node's data roles are exactly [data_frozen] so it may only hold shards from frozen searchable snapshots, but this index is not a frozen searchable snapshot` - -**Resolutions** - -* Make sure nodes are available in each data tier and have sufficient disk space. -* [Check the index settings](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-indices) and ensure shards can be allocated to the expected data tier. -* Check the [ILM policy](../../../manage-data/lifecycle/index-lifecycle-management.md) and check for issues with the [allocate action](https://www.elastic.co/guide/en/elasticsearch/reference/current/ilm-allocate.html). -* Inspect the [index templates](../../../manage-data/data-store/templates.md) and check for issues with the index settings. - - -## The number of eligible data nodes is less than the number of replicas [ech-eligible-data-nodes-less-than-replicas] - -**Symptom** - -Unassigned replica shards are often caused by there being fewer eligible data nodes than the configured number_of_replicas. - -**Resolutions** - -* Add more [eligible data nodes or more availability zones](../../../deploy-manage/deploy/elastic-cloud/ech-customize-deployment-components.md) to ensure resiliency. -* Adjust the `number_of_replicas` [setting](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-put-settings) for your indices to the number of eligible data nodes -1. - - -## A snapshot issue prevents searchable snapshot indices from being allocated [ech-searchable-snapshot-indices-not-allocated] - -**Symptom** - -Some snapshots operations might be impacted, as shown in the following example: - -`failed shard on node [Yc_Jbf73QVSVYSqZT8HPlA]: failed recovery, failure RecoveryFailedException[[restored-my_index-2021.32][1]: … SnapshotMissingException[[found-snapshots:2021.08.25-my_index-2021.32-default_policy-_j2k8it9qnehe1t-2k0u6a/iOAoyjWLTyytKkW3_wF1jw] is missing]; nested: NoSuchFileException[Blob object [snapshots/52bc3ae2030a4df8ab10559d1720a13c/indices/WRlkKDuPSLW__M56E8qbfA/1/snap-iOAoyjWLTyytKkW3_wF1jw.dat] not found: The specified key does not exist. 
(Service: Amazon S3; Status Code: 404; Error Code: NoSuchKey; Request ID: 4AMTM1XFMTV5F00V; S3 Extended Request ID:` - -**Resolutions** - -Upgrade to {{es}} version 7.17.0 or later, which resolves bugs that affected snapshot operations in earlier versions. Check [Upgrade versions](../../../deploy-manage/upgrade/deployment-or-cluster.md) for more details. - -If you can’t upgrade, you can recreate the snapshot repository as a workaround. - -The bugs also affect searchable snapshots. If you still have data in the cluster but cannot restore from the searchable snapshot, you can try reindexing and recreating the searchable snapshot: - -* Reindex all the affected indices to new regular indices -* Remove the affected frozen indices -* Take the snapshot and mount the indices again - - -## Max shard per node reached the limit [ech-max-shard-per-node] - -**Symptom** - -The parameter [`cluster.max_shards_per_node`](https://www.elastic.co/guide/en/elasticsearch/reference/current/misc-cluster-settings.html#cluster-max-shards-per-node) limits the total number of primary and replica shards for the cluster. If your cluster has a number of shards beyond this limit, you might get the following message: - -`Validation Failed: 1: this action would add [2] shards, but this cluster currently has [1000]/[1000] maximum normal shards open` - -**Resolutions** - -Delete unnecessary indices, add more data nodes, and [avoid oversharding](../../../deploy-manage/production-guidance/optimize-performance/size-shards.md) as too many shards can overwhelm your cluster. If you cannot take these actions, and you’re confident your changes won’t destabilize the cluster, you can temporarily increase the limit using the [cluster update settings API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-put-settings) and retry the action. For more details, check [Troubleshoot shard-related errors](../../../deploy-manage/production-guidance/optimize-performance/size-shards.md#troubleshoot-shard-related-errors). - - -## Maximum retry times exceeded [ech-max-retry-exceeded] - -**Symptom** - -The cluster will attempt to allocate a shard a few times, before giving up and leaving the shard unallocated. On {{es}} Service, `index.allocation.max_retries` defaults to 5. If allocation fails after the maximum number of retries, you might get the following message: - -`shard has exceeded the maximum number of retries [%d] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [%s]` - -**Resolutions** - -Run [`POST /_cluster/reroute?retry_failed=true`](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-reroute) API to retry. If it still fails, rerun the [Cluster allocation explain](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-allocation-explain) API to diagnose the problem. - - diff --git a/raw-migrated-files/cloud/cloud-heroku/ech-single-node-deployment-disk-used.md b/raw-migrated-files/cloud/cloud-heroku/ech-single-node-deployment-disk-used.md deleted file mode 100644 index c7211e9083..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/ech-single-node-deployment-disk-used.md +++ /dev/null @@ -1,46 +0,0 @@ -# Full disk on single-node deployment [ech-single-node-deployment-disk-used] - -**Health check** - -1. Log in to the [Elasticsearch Add-On for Heroku console](https://cloud.elastic.co?page=docs&placement=docs-body). -2. 
From the Elasticsearch Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. - - :::{image} ../../../images/cloud-heroku-ec-quick-link-to-deployment.png - :alt: Quick link to the deployment page - ::: - -3. On your deployment page, scroll down to **Instances** and check if the disk allocation for your {{es}} instance is over 90%. - - :::{image} ../../../images/cloud-heroku-ec-full-disk-single-node.png - :alt: Full disk on single-node deployment - ::: - - -**Possible cause** - -* The available storage is insufficient for the amount of ingested data. - -**Resolution** - -* [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). -* Increase the disk size on your Hot data and Content tier (scale up). - -::::{note} -If your {{es}} cluster is unhealthy and reports a status of red, then increasing the disk size of your Hot data and Content tier may fail. You might need to delete some data so the configuration can be edited. If you want to increase your disk size without deleting data, then [reach out to Elastic support](../../../deploy-manage/deploy/elastic-cloud/ech-get-help.md) and we will assist you with scaling up. -:::: - - -**Preventions** - -* Increase the disk size on your Hot data and Content tier (scale up). - - From your deployment menu, go to the **Edit** page and increase the **Size per zone** for your Hot data and Content tiers. - - :::{image} ../../../images/cloud-heroku-ec-increase-size-per-zone.png - :alt: Increase size per zone - ::: - -* Enable [autoscaling](../../../deploy-manage/autoscaling.md) to grow your cluster automatically when it runs out of space. -* Configure (ILM) policies to automatically delete unused data. -* Add nodes to your {{es}} cluster and enable [data tiers](../../../manage-data/lifecycle/data-tiers.md) to move older data that you don’t query often to more cost-effective storage. - diff --git a/raw-migrated-files/cloud/cloud-heroku/echscenario_is_my_cluster_really_highly_available.md b/raw-migrated-files/cloud/cloud-heroku/echscenario_is_my_cluster_really_highly_available.md deleted file mode 100644 index 27ba768023..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/echscenario_is_my_cluster_really_highly_available.md +++ /dev/null @@ -1,16 +0,0 @@ -# Is my cluster really highly available? [echscenario_is_my_cluster_really_highly_available] - -You created a new cluster in Elasticsearch Add-On for Heroku that uses three availability zones and index replicas, because you want to use the [cluster for production](../../../deploy-manage/production-guidance/plan-for-production-elastic-cloud.md#ech-ha). It’s a mission-critical deployment and you need it to be able to handle user requests at all times. Your cluster has been up and running for some time and it seems to handle its workload well. But is this cluster really highly available, given its current workload? - -To answer this question, let’s take a look at CPU usage in the **Cluster Performance Metrics** section in the [Elasticsearch Add-On for Heroku console](https://cloud.elastic.co?page=docs&placement=docs-body): - -:::{image} ../../../images/cloud-heroku-metrics-cpu.png -:alt: CPU usage over time -::: - -Cluster performance metrics are shown per node and are color-coded to indicate which running {{es}} instance they belong to. In this case, you can notice that, from about 22:05 until just before 22:30, two out of three nodes are consistently close to maxing out their CPU resources at 100%. 
The third node seems to average somewhere under the 50% mark most of the time. - -This CPU usage graph indicates that your cluster is load-balancing between the nodes in the different availability zones as designed, but the workload is too high to be able to handle the loss of an availability zone. For a cluster to be able to handle the failure of a node, it should be considered at capacity when it uses 50% of its resources. In this case, two of the nodes are already maxed out and the third one is around 50%. If any one of the three nodes were to fail, the volume of user requests would overwhelm the remaining nodes. On smaller clusters up to and including 8 GB of RAM, CPU boosting can temporarily relieve some of the pressure, but you should not rely on this feature for high availability. On larger clusters, CPU boosting is not available. - -Even if your cluster is performing well, you still need to make sure that there is sufficient spare capacity to deal with the outage of an entire availability zone. For this cluster to remain highly available at all times, you either need to increase its size or reduce its workload. - diff --git a/raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md b/raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md deleted file mode 100644 index c1e6f15fd4..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md +++ /dev/null @@ -1,16 +0,0 @@ -# Why are my cluster response times suddenly so much worse? [echscenario_why_are_my_cluster_response_times_suddenly_so_much_worse] - -Your {{es}} cluster is humming along nicely with good performance until you suddenly notice that response times increase substantially, for both index response times and search response times. The cluster is slow to respond for about 10 minutes, after which performance returns to a normal level. - -Initially, you think that perhaps memory pressure is to blame, because you already know that [high memory pressure can cause performance issues](../../../troubleshoot/monitoring/high-memory-pressure.md). You look at the **Cluster Performance Metrics** section of the [Elasticsearch Add-On for Heroku console](https://cloud.elastic.co?page=docs&placement=docs-body) and, after some zooming in to the right time frame, you get these metrics: - -:::{image} ../../../images/cloud-heroku-metrics-response-times.png -:alt: Cluster performance metrics -::: - -Memory pressure is not the culprit. The **Memory Pressure per Node** metric is always well below 75%, and there is virtually no garbage collection overhead, which is consistent with low memory pressure. Similarly, CPU usage spiked and caused CPU boosting to kick in, but there were more than enough CPU credits to sustain the CPU usage spikes to over 300%. The cluster was not constrained by CPU resources, either. - -So what caused the sudden increase in response times? The key to the puzzle lies in the **Number of Requests** metric, which indicates the number of requests that a cluster receives per second. Beginning shortly before 13:32, there was a substantial increase in the number of user requests per second. The number of requests per second continued to rise until the requests began to plateau as your cluster reached its maximum throughput, which in turn caused response times to rise. 
The number of requests remained at a high level for approximately five minutes, until they started to drop off again around 13:40. Overall, the sustained increase of user requests lasted a bit over 10 minutes, consistent with the slowdown you observed. - -This cluster was sized to handle a certain number of user requests. As the user requests exceeded the maximum throughput that a cluster of this size could sustain, response times increased. To avoid such a slowdown, you either need to control the volume of user requests that reaches the {{es}} cluster or you need to size your cluster to be able to accommodate a sudden increase in user requests. - diff --git a/raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_shards_unavailable.md b/raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_shards_unavailable.md deleted file mode 100644 index 0eaeccef21..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_shards_unavailable.md +++ /dev/null @@ -1,29 +0,0 @@ -# Why are my shards unavailable? [echscenario_why_are_shards_unavailable] - -This section describes how to analyze unassigned shards using the Elasticsearch APIs and Kibana. - -* [Analyze unassigned shards using the Elasticsearch API](../../../troubleshoot/monitoring/unavailable-shards.md) -* [Analyze unassigned shards using the Kibana UI](../../../troubleshoot/monitoring/unavailable-shards.md) -* [Remediate common issues returned by the cluster allocation explain API](../../../troubleshoot/monitoring/unavailable-shards.md) - -{{es}} distributes the documents in an index across multiple shards and distributes copies of those shards across multiple nodes in the cluster. This both increases capacity and makes the cluster more resilient, ensuring your data remains available if a node goes down. - -A healthy (green) cluster has a primary copy of each shard and the required number of replicas are assigned to different nodes in the cluster. - -If a cluster has unassigned replica shards, it is functional but vulnerable in the event of a failure. The cluster is unhealthy and reports a status of yellow. - -If a cluster has unassigned primary shards, some of your data is unavailable. The cluster is unhealthy and reports a status of red. - -A formerly-healthy cluster might have unassigned shards because nodes have dropped out or moved, are running out of disk space, or are hitting allocation limits. - -If a cluster has unassigned shards, you might see an error message such as this on the Elastic Cloud console: - -:::{image} ../../../images/cloud-heroku-ec-unhealthy-deployment.png -:alt: Unhealthy deployment error message -::: - -If your issue is not addressed here, then [contact Elastic support for help](../../../deploy-manage/deploy/elastic-cloud/ech-get-help.md). - - - - diff --git a/raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_my_node_unavailable.md b/raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_my_node_unavailable.md deleted file mode 100644 index dc76eadea7..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_my_node_unavailable.md +++ /dev/null @@ -1,26 +0,0 @@ -# Diagnose unavailable nodes [echscenario_why_is_my_node_unavailable] - -This section provides a list of common symptoms and possible actions that you can take to resolve issues when one or more nodes become unhealthy or unavailable. 
This guide is particularly useful if you are not [shipping your logs and metrics](../../../deploy-manage/monitor/stack-monitoring/elastic-cloud-stack-monitoring.md) to a dedicated monitoring cluster. - -**What are the symptoms?** - -* [Full disk on single-node deployment](../../../troubleshoot/monitoring/unavailable-nodes.md) -* [Full disk on multiple-nodes deployment](../../../troubleshoot/monitoring/unavailable-nodes.md) -* [JVM heap usage exceeds the allowed threshold on master nodes](../../../troubleshoot/monitoring/unavailable-nodes.md) -* [CPU usage exceeds the allowed threshold on master nodes](../../../troubleshoot/monitoring/unavailable-nodes.md) -* [Some nodes are unavailable and are displayed as missing](../../../troubleshoot/monitoring/unavailable-nodes.md) - -**What is the impact?** - -* Only some search results are successful -* Ingesting, updating, and deleting data do not work -* Most {{es}} API requests fail - -::::{note} -Some actions described here, such as stopping indexing or Machine Learning jobs, are temporary remediations intended to get your cluster into a state where you can make configuration changes to resolve the issue. -:::: - - -For production deployments, we recommend setting up a dedicated monitoring cluster to collect metrics and logs, troubleshooting views, and cluster alerts. - -If your issue is not addressed here, then [contact Elastic support for help](../../../deploy-manage/deploy/elastic-cloud/ech-get-help.md). diff --git a/raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_performance_degrading_over_time.md b/raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_performance_degrading_over_time.md deleted file mode 100644 index 02f621568d..0000000000 --- a/raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_performance_degrading_over_time.md +++ /dev/null @@ -1,14 +0,0 @@ -# Why is performance degrading over time? [echscenario_why_is_performance_degrading_over_time] - -You have a smaller {{es}} cluster and you’ve noticed that performance seems to have declined recently. The response time during searches seems to have gone up, and overall the system just doesn’t seem to perform quite as well as it used to. You have already looked at the cluster performance metrics and have confirmed that both index and search response times have increased steadily and remained higher than before. So what explains the performance degradation? - -When you look in the **Cluster Performance Metrics** section of the [Elasticsearch Add-On for Heroku console](https://cloud.elastic.co?page=docs&placement=docs-body), you get the following metrics: - -:::{image} ../../../images/cloud-heroku-metrics-credits.png -:alt: CPU usage versus CPU credits over time -::: - -Between just after 00:10 and 00:20, excessively high CPU usage consumes all CPU credits until no more credits are available. CPU credits enable boosting the assigned CPU resources temporarily to improve performance on smaller clusters up to and including 8 GB of RAM when it is needed most, but CPU credits are by their nature limited. You accumulate CPU credits when you use less than your assigned share of CPU resources, and you consume credits when you use more CPU resources than assigned. As you max out your CPU resources, CPU credits permit your cluster to consume more than 100% of the assigned resources temporarily, which explains why CPU usage exceeds 100%, with usage peaks that reach well over 400% for one node. 
As CPU credits are depleted, CPU usage gradually drops until it returns to 100% at 00:30 when no more CPU credits are available. You can also notice that after 00:30 credits gradually begin to accumulate again. - -If you need your cluster to be able to sustain a certain level of performance, you cannot rely on CPU boosting to handle the workload except temporarily. To ensure that performance can be sustained, consider increasing the size of your cluster. - diff --git a/raw-migrated-files/cloud/cloud/ec-config-change-errors.md b/raw-migrated-files/cloud/cloud/ec-config-change-errors.md deleted file mode 100644 index 053f3c3a1f..0000000000 --- a/raw-migrated-files/cloud/cloud/ec-config-change-errors.md +++ /dev/null @@ -1,139 +0,0 @@ -# How do I resolve node bootlooping? [ec-config-change-errors] - -When you attempt to apply a configuration change to a deployment, the attempt may fail with an error indicating that the change could not be applied, and deployment resources may be unable to restart. In some cases, bootlooping may result, where the deployment resources cycle through a continual reboot process. - -:::{image} ../../../images/cloud-ec-ce-configuration-change-failure.png -:alt: A screen capture of the deployment page showing an error: Latest change to {{es}} configuration failed. -::: - -To help diagnose these and any other types of issues in your deployments, we recommend [setting up monitoring](../../../deploy-manage/monitor/stack-monitoring/elastic-cloud-stack-monitoring.md). Then, you can easily view your deployment health and access log files to troubleshoot this configuration failure. - -To confirm if your Elasticsearch cluster is bootlooping, you can check the most recent plan under your [Deployment Activity page](../../../deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) for the error: - -```sh -Plan change failed: Some instances were unable to start properly. -``` - -If this occurs, correlating {{es}} logs should report: - -```sh -fatal exception while booting Elasticsearch -``` - -Following are some frequent causes of a failed configuration change: - -1. [Secure settings](../../../troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-secure-settings) -2. [Expired custom plugins or bundles](../../../troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-expired-bundle-extension) -3. [OOM errors](../../../troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-oom-errors) -4. [Existing index](../../../troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-existing-index) -5. [Insufficient Storage](../../../troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-insufficient-storage) - -If you’re unable to remediate the failing plan’s root cause, you can attempt to reset the deployment to the latest successful {{es}} configuration by performing a [no-op plan](../../../troubleshoot/monitoring/deployment-health-warnings.md). For an example, see this [video walkthrough](https://www.youtube.com/watch?v=8MnXZ9egBbQ). - - -## Secure settings [ec-config-change-errors-secure-settings] - -The most frequent cause of a failed deployment configuration change is due to invalid or mislocated [secure settings](../../../deploy-manage/security/secure-settings.md). 
This can frequently be discovered by searching {{es}} logs for one of the following error messages: - -```sh -IllegalStateException: security initialization failed -java.lang.IllegalArgumentException: unknown secure setting -``` - -These are settings typically added to the keystore for the purpose of: - -1. Setting up third-party authentication, for example [SAML](../../../deploy-manage/users-roles/cluster-or-deployment-auth/saml.md), [OpenID Connect](../../../deploy-manage/users-roles/cluster-or-deployment-auth/openid-connect.md), or [Kerberos](../../../deploy-manage/users-roles/cluster-or-deployment-auth/kerberos.md). -2. Setting up a [custom repository](../../../deploy-manage/tools/snapshot-and-restore/elastic-cloud-hosted.md). - -The keystore allows you to safely store sensitive settings, such as passwords, as a key/value pair. You can then access a secret value from a settings file by referencing its key. Importantly, not all settings can be stored in the keystore, and the keystore does not validate the settings that you add. Adding unsupported settings can cause {{es}} or other components to fail to restart. To check whether a setting is supported in the keystore, look for a "Secure" qualifier in the [lists of reloadable settings](../../../deploy-manage/security/secure-settings.md). - -The following sections detail some secure settings problems that can result in a configuration change error that can prevent a deployment from restarting. You might diagnose these plan failures via the logs or via their [related exit codes](https://www.elastic.co/guide/en/elasticsearch/reference/current/stopping-elasticsearch.html#fatal-errors) `1`, `3`, and `78`. - - -### Invalid or outdated values [ec-config-change-errors-old-values] - -The keystore does not validate any settings that you add, so invalid or outdated values are a common source of errors when you apply a configuration change to a deployment. - -To check the current set of stored settings: - -1. Open the deployment **Security** page. -2. In the **{{es}} keystore** section, check the **Security keys** list. The list is shown only if you currently have settings configured in the keystore. - -One frequent cause of errors is when settings in the keystore are no longer valid, such as when SAML settings are added for a test environment, but the settings are either not carried over or no longer valid in a production environment. - - -### Snapshot repositories [ec-config-change-errors-snapshot-repos] - -Sometimes, settings added to the keystore to connect to a snapshot repository may not be valid. When this happens, you may get an error such as `SettingsException[Neither a secret key nor a shared access token was set.]` - -For example, when adding an [Azure repository storage setting](../../../deploy-manage/tools/snapshot-and-restore/azure-repository.md#repository-azure-usage) such as `azure.client.default.account` to the keystore, the associated setting `azure.client.default.key` must also be added for the configuration to be valid. - - -### Third-party authentication [ec-config-change-errors-third-party-auth] - -When you configure third-party authentication, it’s important that all required configuration elements that are stored in the keystore are included in the {{es}} user settings file. 
For example, when you [create a SAML realm](../../../deploy-manage/users-roles/cluster-or-deployment-auth/saml.md#saml-create-realm), omitting a field such as `idp.entity_id` when that setting is present in the keystore results in a failed configuration change.
-
-
-### Wrong location [ec-config-change-errors-wrong-location]
-
-In some cases, settings may accidentally be added to the keystore that should have been added to the [{{es}} user settings file](../../../deploy-manage/deploy/elastic-cloud/edit-stack-settings.md). It’s always a good idea to check the [lists of reloadable settings](../../../deploy-manage/security/secure-settings.md) to determine if a setting can be stored in the keystore. Settings that can safely be added to the keystore are flagged as `Secure`.
-
-
-## Expired custom plugins or bundles [ec-config-change-errors-expired-bundle-extension]
-
-During the process of applying a configuration change, Elasticsearch Service checks to determine if any [uploaded custom plugins or bundles](../../../deploy-manage/deploy/elastic-cloud/upload-custom-plugins-bundles.md) are expired.
-
-Problematic plugins produce oscillating {{es}} start-up logs like the following:
-
-```sh
-Booting at Sun Sep 4 03:06:43 UTC 2022
-Installing user plugins.
-Installing elasticsearch-analysis-izumo-master-7.10.2-20210618-28f8a97...
-/app/elasticsearch.sh: line 169: [: too many arguments
-Booting at Sun Sep 4 03:06:58 UTC 2022
-Installing user plugins.
-Installing elasticsearch-analysis-izumo-master-7.10.2-20210618-28f8a97...
-/app/elasticsearch.sh: line 169: [: too many arguments
-```
-
-Problematic bundles produce similar oscillations, but their install logs look similar to the following:
-
-```sh
-2024-11-17 15:18:02 https://found-user-plugins.s3.amazonaws.com/XXXXX/XXXXX.zip?response-content-disposition=attachment%3Bfilename%XXXXX%2F4007535947.zip&x-elastic-extension-version=1574194077471&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20241016T133214Z&X-Amz-SignedHeaders=host&X-Amz-Expires=86400&XAmz-Credential=XXXXX%2F20201016%2Fus-east-1%2Fs3%2Faws4_request&X-AmzSignature=XXXXX
-```
-
-Note that in this example the bundle’s expiration (`X-Amz-Date=20241016T133214Z`) is earlier than the log timestamp (`2024-11-17 15:18:02`), so this bundle is considered expired.
-
-To view any added plugins or bundles:
-
-1. Go to the **Features** page and open the **Extensions** tab.
-2. Select any extension and then choose **Update extension** to renew it. No other changes are needed, and any associated configuration change failures should now be able to succeed.
-
-
-## OOM errors [ec-config-change-errors-oom-errors]
-
-Configuration change errors can occur when there is insufficient RAM configured for a data tier. In this case, the cluster typically also shows OOM (out of memory) errors. To resolve these, you need to increase the amount of heap memory, which is half of the memory allocated to a cluster. You might also detect OOM in plan changes via their [related exit codes](https://www.elastic.co/guide/en/elasticsearch/reference/current/stopping-elasticsearch.html#fatal-errors) `127`, `137`, and `158`.
-
-Check the [{{es}} cluster size](../../../deploy-manage/deploy/elastic-cloud/ec-customize-deployment-components.md#ec-cluster-size) and the [JVM memory pressure indicator](../../../deploy-manage/monitor/monitoring-data/ec-memory-pressure.md) documentation to learn more.
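-
-If you still have API access to the cluster, you can also spot-check heap usage directly with the nodes stats API. This is a minimal sketch rather than a full diagnostic; the `filter_path` parameter only trims the response to the relevant fields:
-
-```json
-GET /_nodes/stats/jvm?filter_path=nodes.*.name,nodes.*.jvm.mem.heap_used_percent,nodes.*.jvm.mem.heap_max_in_bytes
-```
-
-Sustained `heap_used_percent` values near the limit, combined with frequent garbage collection, are consistent with the OOM symptoms described above.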
- -As well, you can read our detailed blog [Managing and troubleshooting {{es}} memory](https://www.elastic.co/blog/managing-and-troubleshooting-elasticsearch-memory). - - -## Existing index [ec-config-change-errors-existing-index] - -In rare cases, when you attempt to upgrade the version of a deployment and the upgrade fails on the first attempt, subsequent attempts to upgrade may fail due to already existing resources. The problem may be due to the system preventing itself from overwriting existing indices, resulting in an error such as this: `Another Kibana instance appears to be migrating the index. Waiting for that migration to complete. If no other Kibana instance is attempting migrations, you can get past this message by deleting index .kibana_2 and restarting Kibana`. - -To resolve this: - -1. Check that you don’t need the content. -2. Run an {{es}} [Delete index request](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete) to remove the existing index. - - In this example, the `.kibana_2` index is the rollover of saved objects (such as Kibana visualizations or dashboards) from the original `.kibana_1` index. Since `.kibana_2` was created as part of the failed upgrade process, this index does not yet contain any pertinent data and it can safely be deleted. - -3. Retry the deployment configuration change. - - -## Insufficient Storage [ec-config-change-errors-insufficient-storage] - -Configuration change errors can occur when there is insufficient disk space for a data tier. To resolve this, you need to increase the size of that tier to ensure it provides enough storage to accommodate the data in your cluster tier considering the [high watermark](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-cluster.html#disk-based-shard-allocation). For troubleshooting walkthrough, see [Fix watermark errors](https://www.elastic.co/guide/en/elasticsearch/reference/current/fix-watermark-errors.html). - diff --git a/raw-migrated-files/cloud/cloud/ec-deployment-no-op.md b/raw-migrated-files/cloud/cloud/ec-deployment-no-op.md deleted file mode 100644 index b2e95ea6bb..0000000000 --- a/raw-migrated-files/cloud/cloud/ec-deployment-no-op.md +++ /dev/null @@ -1,21 +0,0 @@ -# How do I resolve deployment health warnings? [ec-deployment-no-op] - -The Elasticsearch Service [Deployments](https://cloud.elastic.co/deployments) page shows the current status of your active deployments. From time to time you may get one or more health warnings, such as the following: - -:::{image} ../../../images/cloud-ec-ce-deployment-health-warning.png -:alt: A screen capture of the deployment page showing a typical warning: Deployment health warning: Latest change to {{es}} configuration failed. -::: - -**Seeing only one warning?** - -To resolve a single health warning, we recommended first re-applying any pending changes: Select **Edit** in the deployment menu to open the Edit page and then click **Save** without making any changes. This will check all components for pending changes and will apply the changes as needed. This may impact the uptime of clusters which are not [highly available](../../../deploy-manage/production-guidance/plan-for-production-elastic-cloud.md#ec-ha). - -Re-saving the deployment configuration without making any changes is often all that’s needed to resolve a transient health warning on the UI. 
Saving will redirect you to the Elasticsearch Service deployment [Activity page](../../../deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) where you can monitor plan completion. Repeat errors should be investigated; for more information refer to [resolving configuration change errors](../../../troubleshoot/monitoring/node-bootlooping.md). - -**Seeing multiple warnings?** - -If multiple health warnings appear for one of your deployments, or if your deployment is unhealthy, we recommend [Getting help](../../../troubleshoot/index.md) through the Elastic Support Portal. - -**Warning about system changes** - -If the warning refers to a system change, check the deployment’s [Activity](../../../deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) page. diff --git a/raw-migrated-files/cloud/cloud/ec-scenario_is_my_cluster_really_highly_available.md b/raw-migrated-files/cloud/cloud/ec-scenario_is_my_cluster_really_highly_available.md deleted file mode 100644 index c491276a88..0000000000 --- a/raw-migrated-files/cloud/cloud/ec-scenario_is_my_cluster_really_highly_available.md +++ /dev/null @@ -1,16 +0,0 @@ -# Is my cluster really highly available? [ec-scenario_is_my_cluster_really_highly_available] - -You created a new cluster in Elasticsearch Service that uses three availability zones and index replicas, because you want to use the [cluster for production](../../../deploy-manage/production-guidance/plan-for-production-elastic-cloud.md#ec-ha). It’s a mission-critical deployment and you need it to be able to handle user requests at all times. Your cluster has been up and running for some time and it seems to handle its workload well. But is this cluster really highly available, given its current workload? - -To answer this question, let’s take a look at CPU usage in the **Cluster Performance Metrics** section in the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body): - -:::{image} ../../../images/cloud-metrics-cpu.png -:alt: CPU usage over time -::: - -Cluster performance metrics are shown per node and are color-coded to indicate which running {{es}} instance they belong to. In this case, you can notice that, from about 22:05 until just before 22:30, two out of three nodes are consistently close to maxing out their CPU resources at 100%. The third node seems to average somewhere under the 50% mark most of the time. - -This CPU usage graph indicates that your cluster is load-balancing between the nodes in the different availability zones as designed, but the workload is too high to be able to handle the loss of an availability zone. For a cluster to be able to handle the failure of a node, it should be considered at capacity when it uses 50% of its resources. In this case, two of the nodes are already maxed out and the third one is around 50%. If any one of the three nodes were to fail, the volume of user requests would overwhelm the remaining nodes. On smaller clusters up to and including 8 GB of RAM, CPU boosting can temporarily relieve some of the pressure, but you should not rely on this feature for high availability. On larger clusters, CPU boosting is not available. - -Even if your cluster is performing well, you still need to make sure that there is sufficient spare capacity to deal with the outage of an entire availability zone. For this cluster to remain highly available at all times, you either need to increase its size or reduce its workload. 
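-
-If you prefer to confirm this from the API rather than the console graphs, a quick spot check is possible with the cat nodes API (a minimal example; the columns are standard `_cat/nodes` fields):
-
-```json
-GET /_cat/nodes?v&h=name,cpu,load_1m,load_5m,heap.percent
-```
-
-If two of the three nodes consistently report CPU close to 100% while the third sits near 50%, the cluster has no headroom left to absorb the loss of an availability zone.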
- diff --git a/raw-migrated-files/cloud/cloud/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md b/raw-migrated-files/cloud/cloud/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md deleted file mode 100644 index a238618606..0000000000 --- a/raw-migrated-files/cloud/cloud/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md +++ /dev/null @@ -1,16 +0,0 @@ -# Why are my cluster response times suddenly so much worse? [ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse] - -Your {{es}} cluster is humming along nicely with good performance until you suddenly notice that response times increase substantially, for both index response times and search response times. The cluster is slow to respond for about 10 minutes, after which performance returns to a normal level. - -Initially, you think that perhaps memory pressure is to blame, because you already know that [high memory pressure can cause performance issues](../../../troubleshoot/monitoring/high-memory-pressure.md). You look at the **Cluster Performance Metrics** section of the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body) and, after some zooming in to the right time frame, you get these metrics: - -:::{image} ../../../images/cloud-metrics-response-times.png -:alt: Cluster performance metrics -::: - -Memory pressure is not the culprit. The **Memory Pressure per Node** metric is always well below 75%, and there is virtually no garbage collection overhead, which is consistent with low memory pressure. Similarly, CPU usage spiked and caused CPU boosting to kick in, but there were more than enough CPU credits to sustain the CPU usage spikes to over 300%. The cluster was not constrained by CPU resources, either. - -So what caused the sudden increase in response times? The key to the puzzle lies in the **Number of Requests** metric, which indicates the number of requests that a cluster receives per second. Beginning shortly before 13:32, there was a substantial increase in the number of user requests per second. The number of requests per second continued to rise until the requests began to plateau as your cluster reached its maximum throughput, which in turn caused response times to rise. The number of requests remained at a high level for approximately five minutes, until they started to drop off again around 13:40. Overall, the sustained increase of user requests lasted a bit over 10 minutes, consistent with the slowdown you observed. - -This cluster was sized to handle a certain number of user requests. As the user requests exceeded the maximum throughput that a cluster of this size could sustain, response times increased. To avoid such a slowdown, you either need to control the volume of user requests that reaches the {{es}} cluster or you need to size your cluster to be able to accommodate a sudden increase in user requests. - diff --git a/raw-migrated-files/cloud/cloud/ec-scenario_why_are_shards_unavailable.md b/raw-migrated-files/cloud/cloud/ec-scenario_why_are_shards_unavailable.md deleted file mode 100644 index 58a3feaa1b..0000000000 --- a/raw-migrated-files/cloud/cloud/ec-scenario_why_are_shards_unavailable.md +++ /dev/null @@ -1,319 +0,0 @@ -# Why are my shards unavailable? [ec-scenario_why_are_shards_unavailable] - -This section describes how to analyze unassigned shards using the Elasticsearch APIs and Kibana. 
- -* [Analyze unassigned shards using the Elasticsearch API](../../../troubleshoot/monitoring/unavailable-shards.md#ec-analyze_shards_with-api) -* [Analyze unassigned shards using the Kibana UI](../../../troubleshoot/monitoring/unavailable-shards.md#ec-analyze_shards_with-kibana) -* [Remediate common issues returned by the cluster allocation explain API](../../../troubleshoot/monitoring/unavailable-shards.md#ec-remediate-issues-allocation-explain-API) - -{{es}} distributes the documents in an index across multiple shards and distributes copies of those shards across multiple nodes in the cluster. This both increases capacity and makes the cluster more resilient, ensuring your data remains available if a node goes down. - -A healthy (green) cluster has a primary copy of each shard and the required number of replicas are assigned to different nodes in the cluster. - -If a cluster has unassigned replica shards, it is functional but vulnerable in the event of a failure. The cluster is unhealthy and reports a status of yellow. - -If a cluster has unassigned primary shards, some of your data is unavailable. The cluster is unhealthy and reports a status of red. - -A formerly-healthy cluster might have unassigned shards because nodes have dropped out or moved, are running out of disk space, or are hitting allocation limits. - -If a cluster has unassigned shards, you might see an error message such as this on the Elastic Cloud console: - -:::{image} ../../../images/cloud-ec-unhealthy-deployment.png -:alt: Unhealthy deployment error message -::: - -If your issue is not addressed here, then [contact Elastic support for help](../../../troubleshoot/index.md). - -## Analyze unassigned shards using the {{es}} API [ec-analyze_shards_with-api] - -You can retrieve information about the status of your cluster, indices, and shards using the {{es}} API. To access the API you can either use the [Kibana Dev Tools Console](../../../explore-analyze/query-filter/tools/console.md), or the [Elasticsearch API console](https://www.elastic.co/guide/en/cloud/current/ec-api-console.html). If you have your own way to run the {{es}} API, check [How to access the API](https://www.elastic.co/guide/en/cloud/current/ec-api-access.html). 
This section shows you how to: - -* [Check cluster health](../../../troubleshoot/monitoring/unavailable-shards.md#ec-check-cluster-health) -* [Check unhealthy indices](../../../troubleshoot/monitoring/unavailable-shards.md#ec-check-unhealthy-indices) -* [Check which shards are unassigned](../../../troubleshoot/monitoring/unavailable-shards.md#ec-check-which-unassigned-shards) -* [Check why shards are unassigned](../../../troubleshoot/monitoring/unavailable-shards.md#ec-check-why-unassigned-shards) -* [Check Elasticsearch cluster logs](../../../troubleshoot/monitoring/unavailable-shards.md#ec-check-es-cluster-logs) - - -#### Check cluster health [ec-check-cluster-health] - -Use the [Cluster health API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-health): - -```json -GET /_cluster/health/ -``` - -This command returns the cluster status (green, yellow, or red) and shows the number of unassigned shards: - -```json -{ - "cluster_name" : "xxx", - "status" : "red", - "timed_out" : false, - "number_of_nodes" : "x", - "number_of_data_nodes" : "x", - "active_primary_shards" : 116, - "active_shards" : 229, - "relocating_shards" : 0, - "initializing_shards" : 0, - "unassigned_shards" : 1, - "delayed_unassigned_shards" : 0, - "number_of_pending_tasks" : 0, - "number_of_inflight_fetch" : 0, - "task_max_waiting_in_queue_millis" : 0, - "active_shards_percent_as_number" : 98.70689655172413 -} -``` - - -#### Check unhealthy indices [ec-check-unhealthy-indices] - -Use the [cat indices API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-indices) to get the status of individual indices. Specify the `health` parameter to limit the results to a particular status, for example `?v&health=red` or `?v&health=yellow`. - -```json -GET /_cat/indices?v&health=red -``` - -This command returns any indices that have unassigned primary shards (red status): - -```json -red open filebeat-7.10.0-2022.01.07-000014 C7N8fxGwRxK0JcwXH18zVg 1 1 -red open filebeat-7.9.3-2022.01.07-000015 Ib4UIJNVTtOg6ovzs011Lq 1 1 -``` - -For more information, refer to [Fix a red or yellow cluster status](../../../troubleshoot/elasticsearch/red-yellow-cluster-status.md#fix-red-yellow-cluster-status). - - -#### Check which shards are unassigned [ec-check-which-unassigned-shards] - -Use the [cat shards API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-shards): - -```json -GET /_cat/shards/?v -``` - -This command returns the index name, followed by the shard type and shard status: - -```json -filebeat-7.10.0-2022.01.07-000014 0 P UNASSIGNED -filebeat-7.9.3-2022.01.07-000015 1 P UNASSIGNED -filebeat-7.9.3-2022.01.07-000015 2 r UNASSIGNED -``` - - -#### Check why shards are unassigned [ec-check-why-unassigned-shards] - -To understand why shards are unassigned, run the [Cluster allocation explain API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-allocation-explain). - -Running the API call `GET _cluster/allocation/explain` retrieves an allocation explanation for unassigned primary shards, or replica shards. 
- -For example, if `_cat/health` shows that the primary shard of shard 1 in the `filebeat-7.9.3-2022.01.07-000015` index is unassigned, you can get the allocation explanation with the following request: - -```json -GET _cluster/allocation/explain -{ - "index": "filebeat-7.9.3-2022.01.07-000015", - "shard": 1, - "primary": true -} -``` - -The response is as follows: - -```json -{ - "index": "filebeat-7.9.3-2022.01.07-000015", - "shard": 1, - "primary": true, - "current_state": "unassigned", - "unassigned_info": { - "reason": "CLUSTER_RECOVERED", - "at": "2022-04-12T13:06:36.125Z", - "last_allocation_status": "no_valid_shard_copy" - }, - "can_allocate": "no_valid_shard_copy", - "allocate_explanation": "cannot allocate because a previous copy of the primary shard existed but can no longer be found on the nodes in the cluster", - "node_allocation_decisions": [ - { - "node_id": "xxxx", - "node_name": "instance-0000000005", - (... skip ...) - "node_decision": "no", - "store": { - "found": false - } - } - ] -} -``` - - -#### Check {{es}} cluster logs [ec-check-es-cluster-logs] - -To determine the allocation issue, you can [check the logs](../../../deploy-manage/monitor/stack-monitoring/elastic-cloud-stack-monitoring.md#ec-check-logs). This is easier if you have set up a dedicated monitoring deployment. - - -## Analyze unassigned shards using the Kibana UI [ec-analyze_shards_with-kibana] - -If you are shipping logs and metrics to a monitoring deployment, go through the following steps. - -1. Select your deployment from the {{es}} Service panel and navigate to the **Logs and metrics** page. -2. Click **Enable**. -3. Choose the deployment where to send your logs and metrics. -4. Click **Save**. It might take a few minutes to apply the configuration changes. -5. Click **View** to open the Kibana UI and get more details on metrics and logs. - -:::{image} ../../../images/cloud-ec-logs-metrics-page.png -:alt: Log and metrics page -::: - -The unhealthy indices appear with a red or yellow status. - -:::{image} ../../../images/cloud-ec-red-yellow-indices.png -:alt: Unhealthy indices in red or yellow status -::: - - -## Remediate common issues returned by the cluster allocation explain API [ec-remediate-issues-allocation-explain-API] - -Here’s how to resolve the most common causes of unassigned shards reported by the cluster allocation explain API. - -* [Disk is full](../../../troubleshoot/monitoring/unavailable-shards.md#ec-disk-full) -* [A node containing data has moved to a different host](../../../troubleshoot/monitoring/unavailable-shards.md#ec-node-moved-to-another-host) -* [Unable to assign shards based on the allocation rule](../../../troubleshoot/monitoring/unavailable-shards.md#ec-cannot-assign-shards-on-allocation-rule) -* [The number of eligible data nodes is less than the number of replicas](../../../troubleshoot/monitoring/unavailable-shards.md#ec-eligible-data-nodes-less-than-replicas) -* [A snapshot issue prevents searchable snapshot indices from being allocated](../../../troubleshoot/monitoring/unavailable-shards.md#ec-searchable-snapshot-indices-not-allocated) -* [Maximum retry times exceeded](../../../troubleshoot/monitoring/unavailable-shards.md#ec-max-retry-exceeded) -* [Max shard per node reached the limit](../../../troubleshoot/monitoring/unavailable-shards.md#ec-max-shard-per-node) - -If your issue is not addressed here, then [contact Elastic support for help](../../../troubleshoot/index.md). 
-
-### Disk is full [ec-disk-full]
-
-**Symptom**
-
-If disk usage exceeds the threshold, you might get one or more of the following messages:
-
-`the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=90%], using more disk space than the maximum allowed [90.0%], actual free: [9.273781776428223%]`
-
-`unable to force allocate shard to [%s] during replacement, as allocating to this node would cause disk usage to exceed 100%% ([%s] bytes above available disk space)`
-
-`the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=85%], using more disk space than the maximum allowed [85.0%], actual free: [14.119771122932434%]`
-
-`after allocating [[restored-xxx][0], node[null], [P], recovery_source[snapshot recovery [Om66xSJqTw2raoNyKxsNWg] from xxx/W5Yea4QuR2yyZ4iM44fumg], s[UNASSIGNED], unassigned_info[[reason=NEW_INDEX_RESTORED], at[2022-03-02T10:56:58.210Z], delayed=false, details[restore_source[xxx]], allocation_status[fetching_shard_data]]] node [GTXrECDRRmGkkAnB48hPqw] would have more than the allowed 10% free disk threshold (8.7% free), preventing allocation`
-
-**Resolutions**
-
-Review the topic for your deployment architecture:
-
-* [Full disk on single-node deployment](../../../troubleshoot/monitoring/unavailable-nodes.md#ec-single-node-deployment-disk-used)
-* [Full disk on multiple-nodes deployment](../../../troubleshoot/monitoring/unavailable-nodes.md#ec-multiple-node-deployment-disk-used)
-
-To learn more, review the following topics:
-
-* [Cluster-level shard allocation and routing settings](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-cluster.html)
-* [Fix watermark errors](../../../troubleshoot/elasticsearch/fix-watermark-errors.md)
-
-
-### A node containing data has moved to a different host [ec-node-moved-to-another-host]
-
-**Symptom**
-
-During routine system maintenance performed by Elastic, a node might move to a different host. If the indices are not configured with replica shards, the shard data on the {{es}} node that is moved will be lost, and you might get one or more of these messages:
-
-`cannot allocate because a previous copy of the primary shard existed but can no longer be found on the nodes in the cluster`
-
-**Resolutions**
-
-Configure a [highly available cluster](../../../deploy-manage/production-guidance/plan-for-production-elastic-cloud.md) to keep your service running. Also, consider taking the following actions to bring your deployment back to health and recover your data from the snapshot.
-
-* [Close the red indices](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-close)
-* [Restore the indices](../../../deploy-manage/tools/snapshot-and-restore.md) from the last successful snapshot
-
-For more information, also check [Designing for resilience](../../../deploy-manage/production-guidance/availability-and-resilience.md).
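-
-For example, assuming the red index identified earlier is `filebeat-7.9.3-2022.01.07-000015` and your snapshots live in the default `found-snapshots` repository, the close-and-restore sequence described above might look like the following sketch (the snapshot name is a placeholder for one of your own snapshots):
-
-```json
-POST /filebeat-7.9.3-2022.01.07-000015/_close
-
-POST /_snapshot/found-snapshots/<snapshot-name>/_restore
-{
-  "indices": "filebeat-7.9.3-2022.01.07-000015"
-}
-```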
-
-
-### Unable to assign shards based on the allocation rule [ec-cannot-assign-shards-on-allocation-rule]
-
-**Symptom**
-
-When shards cannot be assigned due to [data tier allocation](../../../manage-data/lifecycle/data-tiers.md#data-tier-allocation) or [attribute-based allocation](../../../deploy-manage/distributed-architecture/shard-allocation-relocation-recovery/index-level-shard-allocation.md), you might get one or more of these messages:
-
-`node does not match index setting [index.routing.allocation.include] filters [node_type:\"cold\"]`
-
-`index has a preference for tiers [data_cold] and node does not meet the required [data_cold] tier`
-
-`index has a preference for tiers [data_cold,data_warm,data_hot] and node does not meet the required [data_cold] tier`
-
-`index has a preference for tiers [data_warm,data_hot] and node does not meet the required [data_warm] tier`
-
-`this node's data roles are exactly [data_frozen] so it may only hold shards from frozen searchable snapshots, but this index is not a frozen searchable snapshot`
-
-**Resolutions**
-
-* Make sure nodes are available in each data tier and have sufficient disk space.
-* [Check the index settings](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-indices) and ensure shards can be allocated to the expected data tier.
-* Check the [ILM policy](../../../manage-data/lifecycle/index-lifecycle-management.md) and check for issues with the [allocate action](https://www.elastic.co/guide/en/elasticsearch/reference/current/ilm-allocate.html).
-* Inspect the [index templates](../../../manage-data/data-store/templates.md) and check for issues with the index settings.
-
-
-### The number of eligible data nodes is less than the number of replicas [ec-eligible-data-nodes-less-than-replicas]
-
-**Symptom**
-
-Unassigned replica shards are often caused by having fewer eligible data nodes than the configured `number_of_replicas`.
-
-**Resolutions**
-
-* Add more [eligible data nodes or more availability zones](../../../deploy-manage/deploy/elastic-cloud/ec-customize-deployment-components.md) to ensure resiliency.
-* Adjust the `number_of_replicas` [setting](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-put-settings) for your indices to the number of eligible data nodes minus one.
-
-
-### A snapshot issue prevents searchable snapshot indices from being allocated [ec-searchable-snapshot-indices-not-allocated]
-
-**Symptom**
-
-Some snapshot operations might be impacted, as shown in the following example:
-
-`failed shard on node [Yc_Jbf73QVSVYSqZT8HPlA]: failed recovery, failure RecoveryFailedException[[restored-my_index-2021.32][1]: … SnapshotMissingException[[found-snapshots:2021.08.25-my_index-2021.32-default_policy-_j2k8it9qnehe1t-2k0u6a/iOAoyjWLTyytKkW3_wF1jw] is missing]; nested: NoSuchFileException[Blob object [snapshots/52bc3ae2030a4df8ab10559d1720a13c/indices/WRlkKDuPSLW__M56E8qbfA/1/snap-iOAoyjWLTyytKkW3_wF1jw.dat] not found: The specified key does not exist. (Service: Amazon S3; Status Code: 404; Error Code: NoSuchKey; Request ID: 4AMTM1XFMTV5F00V; S3 Extended Request ID:`
-
-**Resolutions**
-
-Upgrade to {{es}} version 7.17.0 or later, which resolves bugs that affected snapshot operations in earlier versions. Check [Upgrade versions](../../../deploy-manage/upgrade/deployment-or-cluster.md) for more details.
-
-If you can’t upgrade, you can recreate the snapshot repository as a workaround.
-
-The bugs also affect searchable snapshots.
If you still have data in the cluster but cannot restore from the searchable snapshot, you can try reindexing and recreating the searchable snapshot: - -* Reindex all the affected indices to new regular indices -* Remove the affected frozen indices -* Take the snapshot and mount the indices again - - -### Max shard per node reached the limit [ec-max-shard-per-node] - -**Symptom** - -The parameter [`cluster.max_shards_per_node`](https://www.elastic.co/guide/en/elasticsearch/reference/current/misc-cluster-settings.html#cluster-max-shards-per-node) limits the total number of primary and replica shards for the cluster. If your cluster has a number of shards beyond this limit, you might get the following message: - -`Validation Failed: 1: this action would add [2] shards, but this cluster currently has [1000]/[1000] maximum normal shards open` - -**Resolutions** - -Delete unnecessary indices, add more data nodes, and [avoid oversharding](../../../deploy-manage/production-guidance/optimize-performance/size-shards.md) as too many shards can overwhelm your cluster. If you cannot take these actions, and you’re confident your changes won’t destabilize the cluster, you can temporarily increase the limit using the [cluster update settings API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-put-settings) and retry the action. For more details, check [Troubleshoot shard-related errors](../../../deploy-manage/production-guidance/optimize-performance/size-shards.md#troubleshoot-shard-related-errors). - - -### Maximum retry times exceeded [ec-max-retry-exceeded] - -**Symptom** - -The cluster will attempt to allocate a shard a few times, before giving up and leaving the shard unallocated. On {{es}} Service, `index.allocation.max_retries` defaults to 5. If allocation fails after the maximum number of retries, you might get the following message: - -`shard has exceeded the maximum number of retries [%d] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [%s]` - -**Resolutions** - -Run [`POST /_cluster/reroute?retry_failed=true`](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-reroute) API to retry. If it still fails, rerun the [Cluster allocation explain](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-allocation-explain) API to diagnose the problem. - - - diff --git a/raw-migrated-files/cloud/cloud/ec-scenario_why_is_my_node_unavailable.md b/raw-migrated-files/cloud/cloud/ec-scenario_why_is_my_node_unavailable.md deleted file mode 100644 index a58f9735ea..0000000000 --- a/raw-migrated-files/cloud/cloud/ec-scenario_why_is_my_node_unavailable.md +++ /dev/null @@ -1,309 +0,0 @@ -# Diagnose unavailable nodes [ec-scenario_why_is_my_node_unavailable] - -This section provides a list of common symptoms and possible actions that you can take to resolve issues when one or more nodes become unhealthy or unavailable. This guide is particularly useful if you are not [shipping your logs and metrics](../../../deploy-manage/monitor/stack-monitoring/elastic-cloud-stack-monitoring.md) to a dedicated monitoring cluster. 
- -**What are the symptoms?** - -* [Full disk on single-node deployment](../../../troubleshoot/monitoring/unavailable-nodes.md#ec-single-node-deployment-disk-used) -* [Full disk on multiple-nodes deployment](../../../troubleshoot/monitoring/unavailable-nodes.md#ec-multiple-node-deployment-disk-used) -* [JVM heap usage exceeds the allowed threshold on master nodes](../../../troubleshoot/monitoring/unavailable-nodes.md#ec-jvm-heap-usage-exceed-allowed-threshold) -* [CPU usage exceeds the allowed threshold on master nodes](../../../troubleshoot/monitoring/unavailable-nodes.md#ec-cpu-usage-exceed-allowed-threshold) -* [Some nodes are unavailable and are displayed as missing](../../../troubleshoot/monitoring/unavailable-nodes.md#ec-nodes-unavailable-missing) - -**What is the impact?** - -* Only some search results are successful -* Ingesting, updating, and deleting data do not work -* Most {{es}} API requests fail - -::::{note} -Some actions described here, such as stopping indexing or Machine Learning jobs, are temporary remediations intended to get your cluster into a state where you can make configuration changes to resolve the issue. -:::: - - -For production deployments, we recommend setting up a dedicated monitoring cluster to collect metrics and logs, troubleshooting views, and cluster alerts. - -If your issue is not addressed here, then [contact Elastic support for help](../../../troubleshoot/index.md). - -## Full disk on single-node deployment [ec-single-node-deployment-disk-used] - -**Health check** - -1. Log in to the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body). -2. From the Elasticsearch Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. - - :::{image} ../../../images/cloud-ec-quick-link-to-deployment.png - :alt: Quick link to the deployment page - ::: - -3. On your deployment page, scroll down to **Instances** and check if the disk allocation for your {{es}} instance is over 90%. - - :::{image} ../../../images/cloud-ec-full-disk-single-node.png - :alt: Full disk on single-node deployment - ::: - - -**Possible cause** - -* The available storage is insufficient for the amount of ingested data. - -**Resolution** - -* [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). -* Increase the disk size on your Hot data and Content tier (scale up). - -::::{note} -If your {{es}} cluster is unhealthy and reports a status of red, then increasing the disk size of your Hot data and Content tier may fail. You might need to delete some data so the configuration can be edited. If you want to increase your disk size without deleting data, then [reach out to Elastic support](../../../troubleshoot/index.md) and we will assist you with scaling up. -:::: - - -**Preventions** - -* Increase the disk size on your Hot data and Content tier (scale up). - - From your deployment menu, go to the **Edit** page and increase the **Size per zone** for your Hot data and Content tiers. - - :::{image} ../../../images/cloud-ec-increase-size-per-zone.png - :alt: Increase size per zone - ::: - -* Enable [autoscaling](../../../deploy-manage/autoscaling.md) to grow your cluster automatically when it runs out of space. -* Configure [ILM](../../../manage-data/lifecycle/index-lifecycle-management.md) policies to automatically delete unused data. 
-* Add nodes to your {{es}} cluster and enable [data tiers](../../../manage-data/lifecycle/data-tiers.md) to move older data that you don’t query often to more cost-effective storage. - - -## Full disk on multiple-nodes deployment [ec-multiple-node-deployment-disk-used] - -**Health check** - -1. Log in to the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body). -2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. - - :::{image} ../../../images/cloud-ec-quick-link-to-deployment.png - :alt: Quick link to the deployment page - ::: - -3. On your deployment page, scroll down to **Instances** and check if the disk allocation for any of your {{es}} instances is over 90%. - - :::{image} ../../../images/cloud-ec-full-disk-multiple-nodes.png - :alt: Full disk on multiple-nodes deployment - ::: - - -**Possible cause** - -* The available storage is insufficient for the amount of ingested data. - -**Resolution** - -* [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). -* Increase the disk size (scale up). - -::::{note} -If your {{es}} cluster is unhealthy and reports a status of red, the scale up configuration change to increasing disk size on the affected data tiers may fail. You might need to delete some data so the configuration can be edited. If you want to increase your disk size without deleting data, then [reach out to Elastic support](../../../troubleshoot/index.md) and we will assist you with scaling up. -:::: - - -**Preventions** - -* Increase the disk size (scale up). - - 1. On your deployment page, scroll down to **Instances** and identify the node attribute of the instances that are running out of disk space. - - :::{image} ../../../images/cloud-ec-node-attribute.png - :alt: Instance node attribute - ::: - - 2. Use the node types identified at step 1 to find out the corresponding data tier. - - :::{image} ../../../images/cloud-ec-node-types-data-tiers.png - :alt: Node type and corresponding attribute - ::: - - 3. From your deployment menu, go to the **Edit** page and increase the **Size per zone** for the data tiers identified at step 2. - - :::{image} ../../../images/cloud-ec-increase-size-per-zone.png - :alt: Increase size per zone - ::: - -* Enable [autoscaling](../../../deploy-manage/autoscaling.md) to grow your cluster automatically when it runs out of space. -* Configure [ILM](../../../manage-data/lifecycle/index-lifecycle-management.md) policies to automatically delete unused data. -* Enable [data tiers](../../../manage-data/lifecycle/data-tiers.md) to move older data that you don’t query often to more cost-effective storage. - - -## JVM heap usage exceeds the allowed threshold on master nodes [ec-jvm-heap-usage-exceed-allowed-threshold] - -**Health check** - -1. Log in to the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body). -2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. - - :::{image} ../../../images/cloud-ec-quick-link-to-deployment.png - :alt: Quick link to the deployment page - ::: - -3. On your deployment page, scroll down to **Instances** and check if the JVM memory pressure for your {{es}} instances is high. 
- - :::{image} ../../../images/cloud-ec-deployment-instances-config.png - :alt: Deployment instances configuration - ::: - - -**Possible causes** - -* The master node is overwhelmed by a large number of snapshots or shards. - - * External tasks initiated by clients - - * Index, search, update - * Frequent template updates due to the Beats configuration - - * Internal tasks initiated by users - - * Machine Learning jobs, watches, monitoring, ingest pipeline - - * Internal tasks initiated by {{es}} - - * Nodes joining and leaving due to hardware failures - * Shard allocation due to nodes joining and leaving - * Configuration of [ILM](../../../manage-data/lifecycle/index-lifecycle-management.md) policies. - - -**Resolutions** - -* If the master node is overwhelmed by external tasks initiated by clients: - - Investigate which clients might be overwhelming the cluster and reduce the request rate or pause ingesting, searching, or updating from the client. If you are using Beats, temporarily stop the Beat that’s overwhelming the cluster to avoid frequent template updates. - -* If the master node is overwhelmed by internal tasks initiated by users: - - * Check [cluster-level pending tasks](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-pending-tasks). - * Reduce the number of Machine Learning jobs or watches. - * Change the number of ingest pipelines or processors to use less memory. - -* If the master node is overwhelmed by internal tasks initiated by {{es}}: - - * For nodes joining and leaving, this should resolve itself. If increasing the master nodes size doesn’t resolve the issue, contact support. - * For shard allocation, inspect the progress of shards recovery. - - * Make sure `indices.recovery.max_concurrent_operations` is not aggressive, which could cause the master to be unavailable. - * Make sure `indices.recovery.max_bytes_per_sec` is set adequately to avoid impact on ingest and search workload. - - * Check [ILM](../../../manage-data/lifecycle/index-lifecycle-management.md) policies to avoid index rollover and relocate actions that are concurrent and aggressive. - -* If the master node is overwhelmed by a large number of snapshots, reduce the number of snapshots in the repo. -* If the master node is overwhelmed by a large number of shards, delete unneeded indices and shrink read-only indices to fewer shards. For more information, check [Reduce a cluster’s shard count](../../../deploy-manage/production-guidance/optimize-performance/size-shards.md#reduce-cluster-shard-count). - - -## CPU usage exceeds the allowed threshold on master nodes [ec-cpu-usage-exceed-allowed-threshold] - -**Health check** - -By default, the allowed CPU usage threshold is set at 85%. - -1. Log in to the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body). -2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. - - :::{image} ../../../images/cloud-ec-quick-link-to-deployment.png - :alt: Quick link to the deployment page - ::: - -3. Identify the IDs of your master nodes. On your deployment page, scroll down to **Instances** and filter your instance configuration by master. The IDs of your master nodes are in the title. 
In this example, the IDs are 21, 26 and 27: - - :::{image} ../../../images/cloud-ec-instances-filtered-by-master-id.png - :alt: Instances configuration filtered by master nodes ID - ::: - - ::::{note} - The name of the instance configuration might differ depending on the cloud provider. - :::: - -4. Navigate to the **Performance** page of your deployment. Check if the CPU usage of your master nodes exceeds 85%. Your master node has the format `instance-``, where ``` is the ID of the master node. - -If you use [Stack Monitoring](https://www.elastic.co/guide/en/kibana/current/xpack-monitoring.html), open Kibana from your deployment page and select **Stack Monitoring** from the menu or the search bar. - -::::{note} -Stack Monitoring comes with out-of-the-box rules, but you need to enable them when prompted. -:::: - - -**Possible causes** - -* The master node is overwhelmed by a large number of snapshots or shards. -* The memory available on the master node is overwhelmed by these tasks: - - * External tasks initiated by clients - - * Index, search, update - * Frequent template updates due to the Beats configuration - - * Internal tasks initiated by users - - * Machine Learning jobs, watches, monitoring, ingest pipelines - - * Internal tasks initiated by {{es}} - - * Nodes joining and leaving due to hardware failures - * Shard allocation due to nodes joining and leaving - * Configuration of [ILM](../../../manage-data/lifecycle/index-lifecycle-management.md) policies. - - -**Resolutions** - -* Navigate to the **Edit** page of your deployment and increase the master node size. -* [Upgrade the cluster](../../../deploy-manage/upgrade/deployment-or-cluster.md) to the latest version. -* If the master node is overwhelmed by external tasks initiated by clients: - - * Reduce the request rate or pause ingesting, searching, or updating from the client. - * Enable ingest and search-based autoscaling. - * Stop Beats to avoid frequent template updates. - -* If the master node is overwhelmed by internal tasks initiated by users: - - * Check [cluster-level pending tasks](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-pending-tasks). - * Reduce the number of Machine Learning jobs or watches. - * Change the number of ingest pipelines or processors to use less memory. - -* If the master node is overwhelmed by internal tasks initiated by {{es}}: - - * For nodes joining and leaving, this should resolve itself. If increasing the master nodes size doesn’t resolve the issue, contact support. - * For shard allocation, inspect the progress of shards recovery. If there’s no progress, contact support. - - * Make sure `indices.recovery.max_concurrent_operations` is not aggressive, which could cause the master to be unavailable. - * Make sure `indices.recovery.max_bytes_per_sec` is set adequately to avoid impact on ingest and search workload. - - * Check [ILM](../../../manage-data/lifecycle/index-lifecycle-management.md) policies to avoid index rollover and relocate actions that are concurrent and aggressive. - -* If the master node is overwhelmed by a large number of snapshots, reduce the number of snapshots in the repo. -* If the master node is overwhelmed by a large number of shards, reduce the number of shards on the node. For more information, check [Size your shards](../../../deploy-manage/production-guidance/optimize-performance/size-shards.md). 
- - -## Some nodes are unavailable and are displayed as missing [ec-nodes-unavailable-missing] - -**Health check** - -* Use the [Metrics inventory](https://www.elastic.co/guide/en/observability/current/monitor-infrastructure-and-hosts.html) to identify unavailable or unhealthy nodes. If the number of minimum master nodes is down, {{es}} is not available. - -**Possible causes** - -* Hardware issue. -* Routing has stopped because of a previous ES configuration failure. -* Disk/memory/CPU are saturated. -* The network is saturated or disconnected. -* Nodes are unable to join. - -**Resolutions** - -* Hardware issue: Any unhealthy hardware detected by the platform is automatically vacated within the hour. If this doesn’t happen, contact support. -* Routing stopped: A failed {{es}} configuration might stop the nodes routing. Restart the routing manually to bring the node back to health. -* Disk/memory/CPU saturated: - - * [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). - * Increase disk size. - * [Enable autoscaling](../../../deploy-manage/autoscaling.md). - * Configuration of [ILM](../../../manage-data/lifecycle/index-lifecycle-management.md) policies. - * [Manage data tiers](../../../manage-data/lifecycle/data-tiers.md). - -* Network saturated or disconnected: Contact support. -* Nodes unable to join: Fix the {{es}} configuration. -* Nodes unable to join: Contact support. diff --git a/raw-migrated-files/cloud/cloud/ec-scenario_why_is_performance_degrading_over_time.md b/raw-migrated-files/cloud/cloud/ec-scenario_why_is_performance_degrading_over_time.md deleted file mode 100644 index c4a839044f..0000000000 --- a/raw-migrated-files/cloud/cloud/ec-scenario_why_is_performance_degrading_over_time.md +++ /dev/null @@ -1,14 +0,0 @@ -# Why is performance degrading over time? [ec-scenario_why_is_performance_degrading_over_time] - -You have a smaller {{es}} cluster and you’ve noticed that performance seems to have declined recently. The response time during searches seems to have gone up, and overall the system just doesn’t seem to perform quite as well as it used to. You have already looked at the cluster performance metrics and have confirmed that both index and search response times have increased steadily and remained higher than before. So what explains the performance degradation? - -When you look in the **Cluster Performance Metrics** section of the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body), you get the following metrics: - -:::{image} ../../../images/cloud-metrics-credits.png -:alt: CPU usage versus CPU credits over time -::: - -Between just after 00:10 and 00:20, excessively high CPU usage consumes all CPU credits until no more credits are available. CPU credits enable boosting the assigned CPU resources temporarily to improve performance on smaller clusters up to and including 8 GB of RAM when it is needed most, but CPU credits are by their nature limited. You accumulate CPU credits when you use less than your assigned share of CPU resources, and you consume credits when you use more CPU resources than assigned. As you max out your CPU resources, CPU credits permit your cluster to consume more than 100% of the assigned resources temporarily, which explains why CPU usage exceeds 100%, with usage peaks that reach well over 400% for one node. As CPU credits are depleted, CPU usage gradually drops until it returns to 100% at 00:30 when no more CPU credits are available. 
You can also notice that after 00:30 credits gradually begin to accumulate again. - -If you need your cluster to be able to sustain a certain level of performance, you cannot rely on CPU boosting to handle the workload except temporarily. To ensure that performance can be sustained, consider increasing the size of your cluster. - diff --git a/raw-migrated-files/toc.yml b/raw-migrated-files/toc.yml index f8ddbc4fe7..b549997c2a 100644 --- a/raw-migrated-files/toc.yml +++ b/raw-migrated-files/toc.yml @@ -50,11 +50,9 @@ toc: - file: cloud/cloud-enterprise/ece-api-console.md - file: cloud/cloud-enterprise/ece-autoscaling.md - file: cloud/cloud-enterprise/ece-change-deployment.md - - file: cloud/cloud-enterprise/ece-config-change-errors.md - file: cloud/cloud-enterprise/ece-configuring-keystore.md - file: cloud/cloud-enterprise/ece-create-deployment.md - file: cloud/cloud-enterprise/ece-delete-deployment.md - - file: cloud/cloud-enterprise/ece-deployment-no-op.md - file: cloud/cloud-enterprise/ece-enable-auditing.md - file: cloud/cloud-enterprise/ece-find.md - file: cloud/cloud-enterprise/ece-generate-roles-token.md @@ -108,34 +106,25 @@ toc: - file: cloud/cloud-heroku/ech-add-user-settings.md - file: cloud/cloud-heroku/ech-adding-elastic-plugins.md - file: cloud/cloud-heroku/ech-adding-plugins.md - - file: cloud/cloud-heroku/ech-analyze_shards_with-api.md - - file: cloud/cloud-heroku/ech-analyze_shards_with-kibana.md - file: cloud/cloud-heroku/ech-autoscaling.md - - file: cloud/cloud-heroku/ech-config-change-errors.md - file: cloud/cloud-heroku/ech-configure-settings.md - file: cloud/cloud-heroku/ech-configure.md - file: cloud/cloud-heroku/ech-configuring-keystore.md - - file: cloud/cloud-heroku/ech-cpu-usage-exceed-allowed-threshold.md - file: cloud/cloud-heroku/ech-custom-bundles.md - file: cloud/cloud-heroku/ech-custom-repository.md - file: cloud/cloud-heroku/ech-delete-deployment.md - - file: cloud/cloud-heroku/ech-deployment-no-op.md - file: cloud/cloud-heroku/ech-editing-user-settings.md - file: cloud/cloud-heroku/ech-enable-kibana2.md - file: cloud/cloud-heroku/ech-enable-logging-and-monitoring.md - file: cloud/cloud-heroku/ech-getting-started.md - - file: cloud/cloud-heroku/ech-jvm-heap-usage-exceed-allowed-threshold.md - file: cloud/cloud-heroku/ech-manage-apm-settings.md - file: cloud/cloud-heroku/ech-manage-kibana-settings.md - file: cloud/cloud-heroku/ech-metrics-memory-pressure.md - file: cloud/cloud-heroku/ech-monitoring-setup.md - file: cloud/cloud-heroku/ech-monitoring.md - - file: cloud/cloud-heroku/ech-multiple-node-deployment-disk-used.md - - file: cloud/cloud-heroku/ech-nodes-unavailable-missing.md - file: cloud/cloud-heroku/ech-password-reset.md - file: cloud/cloud-heroku/ech-planning.md - file: cloud/cloud-heroku/ech-regional-deployment-aliases.md - - file: cloud/cloud-heroku/ech-remediate-issues-allocation-explain-API.md - file: cloud/cloud-heroku/ech-restoring-snapshots.md - file: cloud/cloud-heroku/ech-restrictions-monitoring.md - file: cloud/cloud-heroku/ech-saas-metrics-accessing.md @@ -144,7 +133,6 @@ toc: - file: cloud/cloud-heroku/ech-securing-clusters-JWT.md - file: cloud/cloud-heroku/ech-securing-clusters-SAML.md - file: cloud/cloud-heroku/ech-security.md - - file: cloud/cloud-heroku/ech-single-node-deployment-disk-used.md - file: cloud/cloud-heroku/ech-snapshot-restore.md - file: cloud/cloud-heroku/ech-traffic-filtering-deployment-configuration.md - file: cloud/cloud-heroku/ech-traffic-filtering-ip.md @@ -152,11 +140,6 @@ toc: - file: 
cloud/cloud-heroku/ech-traffic-filtering-vnet.md - file: cloud/cloud-heroku/ech-traffic-filtering-vpc.md - file: cloud/cloud-heroku/ech-upgrade-deployment.md - - file: cloud/cloud-heroku/echscenario_is_my_cluster_really_highly_available.md - - file: cloud/cloud-heroku/echscenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md - - file: cloud/cloud-heroku/echscenario_why_are_shards_unavailable.md - - file: cloud/cloud-heroku/echscenario_why_is_my_node_unavailable.md - - file: cloud/cloud-heroku/echscenario_why_is_performance_degrading_over_time.md - file: cloud/cloud-heroku/echsign-outgoing-saml-message.md - file: cloud/cloud/index.md children: @@ -171,14 +154,12 @@ toc: - file: cloud/cloud/ec-autoscaling.md - file: cloud/cloud/ec-billing-stop.md - file: cloud/cloud/ec-cloud-ingest-data.md - - file: cloud/cloud/ec-config-change-errors.md - file: cloud/cloud/ec-configure-index-management.md - file: cloud/cloud/ec-configuring-keystore.md - file: cloud/cloud/ec-custom-bundles.md - file: cloud/cloud/ec-custom-repository.md - file: cloud/cloud/ec-customize-deployment.md - file: cloud/cloud/ec-delete-deployment.md - - file: cloud/cloud/ec-deployment-no-op.md - file: cloud/cloud/ec-editing-user-settings.md - file: cloud/cloud/ec-enable-logging-and-monitoring.md - file: cloud/cloud/ec-faq-getting-started.md @@ -212,11 +193,6 @@ toc: - file: cloud/cloud/ec-restore-across-clusters.md - file: cloud/cloud/ec-restoring-snapshots.md - file: cloud/cloud/ec-saas-metrics-accessing.md - - file: cloud/cloud/ec-scenario_is_my_cluster_really_highly_available.md - - file: cloud/cloud/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md - - file: cloud/cloud/ec-scenario_why_are_shards_unavailable.md - - file: cloud/cloud/ec-scenario_why_is_my_node_unavailable.md - - file: cloud/cloud/ec-scenario_why_is_performance_degrading_over_time.md - file: cloud/cloud/ec-secure-clusters-kerberos.md - file: cloud/cloud/ec-secure-clusters-oidc.md - file: cloud/cloud/ec-securing-clusters-JWT.md diff --git a/troubleshoot/monitoring/cloud.md b/troubleshoot/monitoring/cloud.md index a0a6ea16d1..3f8c9ea237 100644 --- a/troubleshoot/monitoring/cloud.md +++ b/troubleshoot/monitoring/cloud.md @@ -5,6 +5,11 @@ mapped_pages: # Monitoring [ec-monitoring-diagnose-resolve] +Use the topics in this section to troubleshoot monitoring, including AutoOps. + +% TODO topic links + + diff --git a/troubleshoot/monitoring/cluster-response-time.md b/troubleshoot/monitoring/cluster-response-time.md index 8489aeb807..4c7235b2db 100644 --- a/troubleshoot/monitoring/cluster-response-time.md +++ b/troubleshoot/monitoring/cluster-response-time.md @@ -1,16 +1,28 @@ --- +navigation_title: "Cluster response time" mapped_pages: - https://www.elastic.co/guide/en/cloud/current/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.html - https://www.elastic.co/guide/en/cloud-heroku/current/echscenario_why_are_my_cluster_response_times_suddenly_so_much_worse.html --- -# Cluster response time +# Troubleshoot slow cluster response time [ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse] + +Your {{es}} cluster is humming along nicely with good performance until you suddenly notice that response times increase substantially, for both index response times and search response times. The cluster is slow to respond for about 10 minutes, after which performance returns to a normal level. 
+ +Initially, you think that perhaps memory pressure is to blame, because you already know that [high memory pressure can cause performance issues](/troubleshoot/monitoring/high-memory-pressure.md). You look at the **Cluster Performance Metrics** section of the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body) and, after some zooming in to the right time frame, you get these metrics: + +:::{image} /images/cloud-metrics-response-times.png +:alt: Cluster performance metrics +::: + +Memory pressure is not the culprit. The **Memory Pressure per Node** metric is always well below 75%, and there is virtually no garbage collection overhead, which is consistent with low memory pressure. Similarly, CPU usage spiked and caused CPU boosting to kick in, but there were more than enough CPU credits to sustain the CPU usage spikes to over 300%. The cluster was not constrained by CPU resources, either. + +So what caused the sudden increase in response times? The key to the puzzle lies in the **Number of Requests** metric, which indicates the number of requests that a cluster receives per second. Beginning shortly before 13:32, there was a substantial increase in the number of user requests per second. The number of requests per second continued to rise until the requests began to plateau as your cluster reached its maximum throughput, which in turn caused response times to rise. The number of requests remained at a high level for approximately five minutes, until they started to drop off again around 13:40. Overall, the sustained increase of user requests lasted a bit over 10 minutes, consistent with the slowdown you observed. + +This cluster was sized to handle a certain number of user requests. As the user requests exceeded the maximum throughput that a cluster of this size could sustain, response times increased. To avoid such a slowdown, you either need to control the volume of user requests that reaches the {{es}} cluster or you need to size your cluster to be able to accommodate a sudden increase in user requests. -% What needs to be done: Lift-and-shift -% Use migrated content from existing pages that map to this page: -% - [ ] ./raw-migrated-files/cloud/cloud/ec-scenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md % Notes: Lift and shift this one % - [ ] ./raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_my_cluster_response_times_suddenly_so_much_worse.md % Notes: dupe, redirect \ No newline at end of file diff --git a/troubleshoot/monitoring/deployment-health-warnings.md b/troubleshoot/monitoring/deployment-health-warnings.md index d0d671fbfa..6eef3c420c 100644 --- a/troubleshoot/monitoring/deployment-health-warnings.md +++ b/troubleshoot/monitoring/deployment-health-warnings.md @@ -1,19 +1,29 @@ --- +navigation_title: "Deployment health warnings" mapped_pages: - https://www.elastic.co/guide/en/cloud/current/ec-deployment-no-op.html - https://www.elastic.co/guide/en/cloud-enterprise/current/ece-deployment-no-op.html - https://www.elastic.co/guide/en/cloud-heroku/current/ech-deployment-no-op.html --- -# Deployment health warnings +# Troubleshoot deployment health warnings [ec-deployment-no-op] -% What needs to be done: Lift-and-shift +The Elasticsearch Service [Deployments](https://cloud.elastic.co/deployments) page shows the current status of your active deployments. 
From time to time you may get one or more health warnings, such as the following:
+
+:::{image} /images/cloud-ec-ce-deployment-health-warning.png
+:alt: A screen capture of the deployment page showing a typical warning: Deployment health warning: Latest change to {{es}} configuration failed.
+:::
+
+**Seeing only one warning?**
+
+To resolve a single health warning, we recommend first re-applying any pending changes: Select **Edit** in the deployment menu to open the Edit page and then click **Save** without making any changes. This will check all components for pending changes and will apply the changes as needed. This may impact the uptime of clusters which are not [highly available](/deploy-manage/production-guidance/plan-for-production-elastic-cloud.md#ec-ha).
+
+Re-saving the deployment configuration without making any changes is often all that’s needed to resolve a transient health warning on the UI. Saving will redirect you to the Elasticsearch Service deployment [Activity page](/deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) where you can monitor plan completion. Repeat errors should be investigated; for more information refer to [resolving configuration change errors](/troubleshoot/monitoring/node-bootlooping.md).
+
+**Seeing multiple warnings?**
+
+If multiple health warnings appear for one of your deployments, or if your deployment is unhealthy, we recommend [Getting help](/troubleshoot/index.md) through the Elastic Support Portal.
+
+**Warning about system changes**
+
+If the warning refers to a system change, check the deployment’s [Activity](/deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) page.
diff --git a/troubleshoot/monitoring/high-availability.md b/troubleshoot/monitoring/high-availability.md
index a0f7257d69..081e2eaff1 100644
--- a/troubleshoot/monitoring/high-availability.md
+++ b/troubleshoot/monitoring/high-availability.md
@@ -4,13 +4,20 @@ mapped_pages:
   - https://www.elastic.co/guide/en/cloud-heroku/current/echscenario_is_my_cluster_really_highly_available.html
 ---
 
-# High availability
+# Cluster performance metrics [ec-scenario_is_my_cluster_really_highly_available]
 
-% What needs to be done: Lift-and-shift
+% TODO: Edit edit edit
 
-% Use migrated content from existing pages that map to this page:
+You created a new cluster in Elasticsearch Service that uses three availability zones and index replicas, because you want to use the [cluster for production](/deploy-manage/production-guidance/plan-for-production-elastic-cloud.md#ec-ha). It’s a mission-critical deployment and you need it to be able to handle user requests at all times. Your cluster has been up and running for some time and it seems to handle its workload well. But is this cluster really highly available, given its current workload?
-% - [ ] ./raw-migrated-files/cloud/cloud/ec-scenario_is_my_cluster_really_highly_available.md -% Notes: Lift and shift this one -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/echscenario_is_my_cluster_really_highly_available.md -% Notes: dupe, redirect \ No newline at end of file +To answer this question, let’s take a look at CPU usage in the **Cluster Performance Metrics** section in the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body): + +:::{image} /images/cloud-metrics-cpu.png +:alt: CPU usage over time +::: + +Cluster performance metrics are shown per node and are color-coded to indicate which running {{es}} instance they belong to. In this case, you can notice that, from about 22:05 until just before 22:30, two out of three nodes are consistently close to maxing out their CPU resources at 100%. The third node seems to average somewhere under the 50% mark most of the time. + +This CPU usage graph indicates that your cluster is load-balancing between the nodes in the different availability zones as designed, but the workload is too high to be able to handle the loss of an availability zone. For a cluster to be able to handle the failure of a node, it should be considered at capacity when it uses 50% of its resources. In this case, two of the nodes are already maxed out and the third one is around 50%. If any one of the three nodes were to fail, the volume of user requests would overwhelm the remaining nodes. On smaller clusters up to and including 8 GB of RAM, CPU boosting can temporarily relieve some of the pressure, but you should not rely on this feature for high availability. On larger clusters, CPU boosting is not available. + +Even if your cluster is performing well, you still need to make sure that there is sufficient spare capacity to deal with the outage of an entire availability zone. For this cluster to remain highly available at all times, you either need to increase its size or reduce its workload. diff --git a/troubleshoot/monitoring/high-memory-pressure.md b/troubleshoot/monitoring/high-memory-pressure.md index 98d091d099..b16db2d1ce 100644 --- a/troubleshoot/monitoring/high-memory-pressure.md +++ b/troubleshoot/monitoring/high-memory-pressure.md @@ -1,16 +1,55 @@ --- +navigation_title: "High memory pressure" mapped_pages: - https://www.elastic.co/guide/en/cloud/current/ec-metrics-memory-pressure.html - https://www.elastic.co/guide/en/cloud-heroku/current/ech-metrics-memory-pressure.html --- -# High memory pressure +# Troubleshoot high memory pressure -% What needs to be done: Lift-and-shift +When you load up an {{es}} cluster with an indexing and search workload that matches the size of the cluster well, you typically get the classic JVM heap sawtooth pattern as memory gets used and then gets freed up again by the garbage collector. Memory usage increases until it reaches 75% and then drops again as memory is freed up: -% Use migrated content from existing pages that map to this page: +:::{image} /images/cloud-metrics-memory-pressure-sawtooth.png +:alt: The classic JVM sawtooth pattern that shows memory usage +::: -% - [ ] ./raw-migrated-files/cloud/cloud/ec-metrics-memory-pressure.md -% Notes: Lift and shift this one -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-metrics-memory-pressure.md -% Notes: dupe, redirect \ No newline at end of file +Now let’s suppose you have a cluster with three nodes and much higher memory pressure overall. 
In this example, two of the three nodes are maxing out very regularly for extended periods and one node is consistently hovering around the 75% mark.
+
+:::{image} /images/cloud-metrics-high-memory-pressure.png
+:alt: High memory pressure
+:::
+
+High memory pressure works against cluster performance in two ways: As memory pressure rises to 75% and above, less memory remains available, but your cluster now also needs to spend some CPU resources to reclaim memory through garbage collection. These CPU resources are not available to handle user requests while garbage collection is going on. As a result, response times for user requests increase as the system becomes more and more resource constrained. If memory pressure continues to rise and reaches near 100%, a much more aggressive form of garbage collection is used, which will in turn affect cluster response times dramatically.
+
+:::{image} /images/cloud-metrics-high-response-times.png
+:alt: High response times
+:::
+
+In our example, the **Index Response Times** metric shows that high memory pressure leads to a significant performance impact. As two of the three nodes max out their memory several times and plateau at 100% memory pressure for 30 to 45 minutes at a time, there is a sharp increase in the index response times around 23:00, 00:00, and 01:00. Search response times, which are not shown, also increase but not as dramatically. Only the node shown in blue, which consistently maintains a much healthier memory pressure that rarely exceeds 75%, can sustain a lower response time.
+
+If the performance impact from high memory pressure is not acceptable, you need to increase the cluster size or reduce the workload.
+
+
+## Increase the deployment size [ec_increase_the_deployment_size]
+
+Scaling with Elasticsearch Service is easy: simply log in to the Elasticsearch Service console, select your deployment, select **Edit**, and either increase the number of zones or the size per zone.
+
+
+## Reduce the workload [ec_reduce_the_workload]
+
+By understanding and adjusting the way your data is indexed, retained, and searched, you can reduce the amount of memory used and increase performance.
+
+
+### Sharding strategy [ec_sharding_strategy]
+
+{{es}} indices are divided into shards. Understanding shards is important when tuning {{es}}. Check [Size your shards](https://www.elastic.co/guide/en/elasticsearch/reference/current/size-your-shards.html) in the {{es}} documentation to learn more.
+
+
+### Data retention [ec_data_retention]
+
+The total amount of data being searched affects search performance. Check the tutorial [Automate rollover with index lifecycle management](https://www.elastic.co/guide/en/elasticsearch/reference/current/getting-started-index-lifecycle-management.html) (ILM) to automate data retention policies.
+
+
+### Tune for search speed [ec_tune_for_search_speed]
+
+The documentation [Tune for search speed](https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-for-search-speed.html) provides details on how to analyze queries, optimize field types, minimize the fields searched, and more.
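+
+Whichever of these approaches you take, you can also track memory from the {{es}} API rather than only from the console graphs. The following is a minimal sketch; note that `heap.percent` is an instantaneous heap reading, so it is related to, but not computed the same way as, the JVM memory pressure metric shown in the console:
+
+```sh
+# Per-node heap and RAM usage, plus node roles
+GET _cat/nodes?v=true&h=name,node.role,heap.percent,ram.percent
+```
+
+Nodes whose heap usage stays high between garbage collections are the ones to watch when you decide whether to scale up or reduce the workload.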
diff --git a/troubleshoot/monitoring/node-bootlooping.md b/troubleshoot/monitoring/node-bootlooping.md index ab21f7f751..36f42e10b3 100644 --- a/troubleshoot/monitoring/node-bootlooping.md +++ b/troubleshoot/monitoring/node-bootlooping.md @@ -1,51 +1,146 @@ --- +navigation_title: "Node bootlooping" mapped_pages: - https://www.elastic.co/guide/en/cloud/current/ec-config-change-errors.html - https://www.elastic.co/guide/en/cloud-enterprise/current/ece-config-change-errors.html - https://www.elastic.co/guide/en/cloud-heroku/current/ech-config-change-errors.html --- -# Node bootlooping +# Troubleshoot node bootlooping [ec-config-change-errors] -% What needs to be done: Lift-and-shift +When you attempt to apply a configuration change to a deployment, the attempt may fail with an error indicating that the change could not be applied, and deployment resources may be unable to restart. In some cases, bootlooping may result, where the deployment resources cycle through a continual reboot process. -% Use migrated content from existing pages that map to this page: +:::{image} /images/cloud-ec-ce-configuration-change-failure.png +:alt: A screen capture of the deployment page showing an error: Latest change to {{es}} configuration failed. +::: -% - [ ] ./raw-migrated-files/cloud/cloud/ec-config-change-errors.md -% Notes: Lift and shift this one -% - [ ] ./raw-migrated-files/cloud/cloud-enterprise/ece-config-change-errors.md -% Notes: redirect -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-config-change-errors.md -% Notes: dupe, redirect +To help diagnose these and any other types of issues in your deployments, we recommend [setting up monitoring](/deploy-manage/monitor/stack-monitoring/elastic-cloud-stack-monitoring.md). Then, you can easily view your deployment health and access log files to troubleshoot this configuration failure. -% Internal links rely on the following IDs being on this page (e.g. as a heading ID, paragraph ID, etc): +To confirm if your Elasticsearch cluster is bootlooping, you can check the most recent plan under your [Deployment Activity page](/deploy-manage/deploy/elastic-cloud/keep-track-of-deployment-activity.md) for the error: -$$$ech-config-change-errors-secure-settings$$$ +```sh +Plan change failed: Some instances were unable to start properly. +``` -$$$ech-config-change-errors-expired-bundle-extension$$$ +If this occurs, correlating {{es}} logs should report: -$$$ech-config-change-errors-oom-errors$$$ +```sh +fatal exception while booting Elasticsearch +``` -$$$ech-config-change-errors-existing-index$$$ +Following are some frequent causes of a failed configuration change: -$$$ech-config-change-errors-insufficient-storage$$$ +1. [Secure settings](/troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-secure-settings) +2. [Expired custom plugins or bundles](/troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-expired-bundle-extension) +3. [OOM errors](/troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-oom-errors) +4. [Existing index](/troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-existing-index) +5. [Insufficient Storage](/troubleshoot/monitoring/node-bootlooping.md#ec-config-change-errors-insufficient-storage) -$$$ece-config-change-errors-secure-settings$$$ +If you’re unable to remediate the failing plan’s root cause, you can attempt to reset the deployment to the latest successful {{es}} configuration by performing a [no-op plan](/troubleshoot/monitoring/deployment-health-warnings.md). 
For an example, see this [video walkthrough](https://www.youtube.com/watch?v=8MnXZ9egBbQ). -$$$ece-config-change-errors-expired-bundle-extension$$$ -$$$ece-config-change-errors-oom-errors$$$ +## Secure settings [ec-config-change-errors-secure-settings] -$$$ece-config-change-errors-existing-index$$$ +The most frequent cause of a failed deployment configuration change is due to invalid or mislocated [secure settings](/deploy-manage/security/secure-settings.md). This can frequently be discovered by searching {{es}} logs for one of the following error messages: -$$$ece-config-change-errors-insufficient-storage$$$ +```sh +IllegalStateException: security initialization failed +java.lang.IllegalArgumentException: unknown secure setting +``` -$$$ec-config-change-errors-secure-settings$$$ +These are settings typically added to the keystore for the purpose of: -$$$ec-config-change-errors-expired-bundle-extension$$$ +1. Setting up third-party authentication, for example [SAML](/deploy-manage/users-roles/cluster-or-deployment-auth/saml.md), [OpenID Connect](/deploy-manage/users-roles/cluster-or-deployment-auth/openid-connect.md), or [Kerberos](/deploy-manage/users-roles/cluster-or-deployment-auth/kerberos.md). +2. Setting up a [custom repository](/deploy-manage/tools/snapshot-and-restore/elastic-cloud-hosted.md). -$$$ec-config-change-errors-oom-errors$$$ +The keystore allows you to safely store sensitive settings, such as passwords, as a key/value pair. You can then access a secret value from a settings file by referencing its key. Importantly, not all settings can be stored in the keystore, and the keystore does not validate the settings that you add. Adding unsupported settings can cause {{es}} or other components to fail to restart. To check whether a setting is supported in the keystore, look for a "Secure" qualifier in the [lists of reloadable settings](/deploy-manage/security/secure-settings.md). -$$$ec-config-change-errors-existing-index$$$ +The following sections detail some secure settings problems that can result in a configuration change error that can prevent a deployment from restarting. You might diagnose these plan failures via the logs or via their [related exit codes](https://www.elastic.co/guide/en/elasticsearch/reference/current/stopping-elasticsearch.html#fatal-errors) `1`, `3`, and `78`. -$$$ec-config-change-errors-insufficient-storage$$$ + +### Invalid or outdated values [ec-config-change-errors-old-values] + +The keystore does not validate any settings that you add, so invalid or outdated values are a common source of errors when you apply a configuration change to a deployment. + +To check the current set of stored settings: + +1. Open the deployment **Security** page. +2. In the **{{es}} keystore** section, check the **Security keys** list. The list is shown only if you currently have settings configured in the keystore. + +One frequent cause of errors is when settings in the keystore are no longer valid, such as when SAML settings are added for a test environment, but the settings are either not carried over or no longer valid in a production environment. + + +### Snapshot repositories [ec-config-change-errors-snapshot-repos] + +Sometimes, settings added to the keystore to connect to a snapshot repository may not be valid. 
When this happens, you may get an error such as `SettingsException[Neither a secret key nor a shared access token was set.]`
+
+For example, when adding an [Azure repository storage setting](/deploy-manage/tools/snapshot-and-restore/azure-repository.md#repository-azure-usage) such as `azure.client.default.account` to the keystore, the associated setting `azure.client.default.key` must also be added for the configuration to be valid.
+
+
+### Third-party authentication [ec-config-change-errors-third-party-auth]
+
+When you configure third-party authentication, it’s important that all required configuration elements that are stored in the keystore are included in the {{es}} user settings file. For example, when you [create a SAML realm](/deploy-manage/users-roles/cluster-or-deployment-auth/saml.md#saml-create-realm), omitting a field such as `idp.entity_id` when that setting is present in the keystore results in a failed configuration change.
+
+
+### Wrong location [ec-config-change-errors-wrong-location]
+
+In some cases, settings may accidentally be added to the keystore that should have been added to the [{{es}} user settings file](/deploy-manage/deploy/elastic-cloud/edit-stack-settings.md). It’s always a good idea to check the [lists of reloadable settings](/deploy-manage/security/secure-settings.md) to determine if a setting can be stored in the keystore. Settings that can safely be added to the keystore are flagged as `Secure`.
+
+
+## Expired custom plugins or bundles [ec-config-change-errors-expired-bundle-extension]
+
+During the process of applying a configuration change, Elasticsearch Service checks to determine if any [uploaded custom plugins or bundles](/deploy-manage/deploy/elastic-cloud/upload-custom-plugins-bundles.md) are expired.
+
+Problematic plugins produce oscillating {{es}} start-up logs like the following:
+
+```sh
+Booting at Sun Sep 4 03:06:43 UTC 2022
+Installing user plugins.
+Installing elasticsearch-analysis-izumo-master-7.10.2-20210618-28f8a97...
+/app/elasticsearch.sh: line 169: [: too many arguments
+Booting at Sun Sep 4 03:06:58 UTC 2022
+Installing user plugins.
+Installing elasticsearch-analysis-izumo-master-7.10.2-20210618-28f8a97...
+/app/elasticsearch.sh: line 169: [: too many arguments
+```
+
+Problematic bundles produce similar oscillations, but their install log looks like the following:
+
+```sh
+2024-11-17 15:18:02 https://found-user-plugins.s3.amazonaws.com/XXXXX/XXXXX.zip?response-content-disposition=attachment%3Bfilename%XXXXX%2F4007535947.zip&x-elastic-extension-version=1574194077471&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20241016T133214Z&X-Amz-SignedHeaders=host&X-Amz-Expires=86400&XAmz-Credential=XXXXX%2F20201016%2Fus-east-1%2Fs3%2Faws4_request&X-AmzSignature=XXXXX
+```
+
+Note in this example that the bundle’s expiration, `X-Amz-Date=20241016T133214Z`, is earlier than the log timestamp `2024-11-17 15:18:02`, so the bundle is considered expired.
+
+To view any added plugins or bundles:
+
+1. Go to the **Features** page and open the **Extensions** tab.
+2. Select any extension and then choose **Update extension** to renew it. No other changes are needed, and any associated configuration change failures should now be able to succeed.
+
+
+## OOM errors [ec-config-change-errors-oom-errors]
+
+Configuration change errors can occur when there is insufficient RAM configured for a data tier. In this case, the cluster typically also shows OOM (out of memory) errors.
To resolve these, you need to increase the amount of heap memory, which is half of the amount of memory allocated to a cluster. You might also detect OOM in plan changes via their [related exit codes](https://www.elastic.co/guide/en/elasticsearch/reference/current/stopping-elasticsearch.html#fatal-errors) `127`, `137`, and `158`. + +Check the [{{es}} cluster size](/deploy-manage/deploy/elastic-cloud/ec-customize-deployment-components.md#ec-cluster-size) and the [JVM memory pressure indicator](/deploy-manage/monitor/monitoring-data/ec-memory-pressure.md) documentation to learn more. + +As well, you can read our detailed blog [Managing and troubleshooting {{es}} memory](https://www.elastic.co/blog/managing-and-troubleshooting-elasticsearch-memory). + + +## Existing index [ec-config-change-errors-existing-index] + +In rare cases, when you attempt to upgrade the version of a deployment and the upgrade fails on the first attempt, subsequent attempts to upgrade may fail due to already existing resources. The problem may be due to the system preventing itself from overwriting existing indices, resulting in an error such as this: `Another Kibana instance appears to be migrating the index. Waiting for that migration to complete. If no other Kibana instance is attempting migrations, you can get past this message by deleting index .kibana_2 and restarting Kibana`. + +To resolve this: + +1. Check that you don’t need the content. +2. Run an {{es}} [Delete index request](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete) to remove the existing index. + + In this example, the `.kibana_2` index is the rollover of saved objects (such as Kibana visualizations or dashboards) from the original `.kibana_1` index. Since `.kibana_2` was created as part of the failed upgrade process, this index does not yet contain any pertinent data and it can safely be deleted. + +3. Retry the deployment configuration change. + + +## Insufficient Storage [ec-config-change-errors-insufficient-storage] + +Configuration change errors can occur when there is insufficient disk space for a data tier. To resolve this, you need to increase the size of that tier to ensure it provides enough storage to accommodate the data in your cluster tier considering the [high watermark](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-cluster.html#disk-based-shard-allocation). For troubleshooting walkthrough, see [Fix watermark errors](https://www.elastic.co/guide/en/elasticsearch/reference/current/fix-watermark-errors.html). 
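+
+If you want to confirm how close each node is to the disk watermarks before you resize, a quick check from the API looks something like this (a sketch; adjust the columns as needed):
+
+```sh
+# How much disk each data node is using
+GET _cat/allocation?v=true&h=node,shards,disk.percent,disk.used,disk.avail,disk.total
+
+# The watermark thresholds currently in effect, including defaults
+GET _cluster/settings?include_defaults=true&filter_path=*.cluster.routing.allocation.disk.watermark*
+```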
\ No newline at end of file
diff --git a/troubleshoot/monitoring/node-moves-outages.md b/troubleshoot/monitoring/node-moves-outages.md
index 1c1c250e5a..04f45c2de9 100644
--- a/troubleshoot/monitoring/node-moves-outages.md
+++ b/troubleshoot/monitoring/node-moves-outages.md
@@ -1,4 +1,5 @@
 ---
+navigation_title: "Troubleshoot node moves and outages"
 mapped_pages:
   - https://www.elastic.co/guide/en/cloud/current/ec-deployment-node-move.html
 ---
diff --git a/troubleshoot/monitoring/performance.md b/troubleshoot/monitoring/performance.md
index 8bf61a0a61..dcd472636a 100644
--- a/troubleshoot/monitoring/performance.md
+++ b/troubleshoot/monitoring/performance.md
@@ -1,16 +1,20 @@
 ---
+navigation_title: "Performance"
 mapped_pages:
   - https://www.elastic.co/guide/en/cloud/current/ec-scenario_why_is_performance_degrading_over_time.html
   - https://www.elastic.co/guide/en/cloud-heroku/current/echscenario_why_is_performance_degrading_over_time.html
 ---
 
-# Performance
+# Troubleshoot performance degrading over time [ec-scenario_why_is_performance_degrading_over_time]
 
-% What needs to be done: Lift-and-shift
+You have a smaller {{es}} cluster and you’ve noticed that performance seems to have declined recently. The response time during searches seems to have gone up, and overall the system just doesn’t seem to perform quite as well as it used to. You have already looked at the cluster performance metrics and have confirmed that both index and search response times have increased steadily and remained higher than before. So what explains the performance degradation?
 
-% Use migrated content from existing pages that map to this page:
+When you look in the **Cluster Performance Metrics** section of the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body), you get the following metrics:
 
-% - [ ] ./raw-migrated-files/cloud/cloud/ec-scenario_why_is_performance_degrading_over_time.md
-% Notes: - Lift and shift the first one (cloud) only
-% - [ ] ./raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_performance_degrading_over_time.md
-% Notes: dupe, redirect
\ No newline at end of file
+:::{image} /images/cloud-metrics-credits.png
+:alt: CPU usage versus CPU credits over time
+:::
+
+Between just after 00:10 and 00:20, excessively high CPU usage consumes all CPU credits until no more credits are available. CPU credits enable boosting the assigned CPU resources temporarily to improve performance on smaller clusters up to and including 8 GB of RAM when it is needed most, but CPU credits are by their nature limited. You accumulate CPU credits when you use less than your assigned share of CPU resources, and you consume credits when you use more CPU resources than assigned. As you max out your CPU resources, CPU credits permit your cluster to consume more than 100% of the assigned resources temporarily, which explains why CPU usage exceeds 100%, with usage peaks that reach well over 400% for one node. As CPU credits are depleted, CPU usage gradually drops until it returns to 100% at 00:30 when no more CPU credits are available. You can also notice that after 00:30 credits gradually begin to accumulate again.
+
+If you need your cluster to be able to sustain a certain level of performance, you cannot rely on CPU boosting to handle the workload except temporarily. To ensure that performance can be sustained, consider increasing the size of your cluster.
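+
+To correlate the console graphs with what the cluster itself reports, you can also sample CPU usage and load averages per node from the API. This is a minimal sketch; the CPU credit balance itself is only visible in the console metrics:
+
+```sh
+# Per-node CPU percentage and load averages
+GET _nodes/stats/os?filter_path=nodes.*.name,nodes.*.os.cpu
+```
+
+If CPU usage stays pinned high even after the credits shown above are exhausted, that is a sign the cluster needs more capacity rather than a temporary boost.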
diff --git a/troubleshoot/monitoring/unavailable-nodes.md b/troubleshoot/monitoring/unavailable-nodes.md index 29b1226be8..9631965208 100644 --- a/troubleshoot/monitoring/unavailable-nodes.md +++ b/troubleshoot/monitoring/unavailable-nodes.md @@ -1,4 +1,7 @@ --- +navigation_title: "Unavailable nodes" +applies: + hosted: mapped_urls: - https://www.elastic.co/guide/en/cloud/current/ec-scenario_why_is_my_node_unavailable.html - https://www.elastic.co/guide/en/cloud-heroku/current/echscenario_why_is_my_node_unavailable.html @@ -9,25 +12,320 @@ mapped_urls: - https://www.elastic.co/guide/en/cloud-heroku/current/ech-nodes-unavailable-missing.html --- -# Unavailable nodes +% TODO fix the layout and formatting + +# Diagnose unavailable nodes [ec-scenario_why_is_my_node_unavailable] + +This section provides a list of common symptoms and possible actions that you can take to resolve issues when one or more nodes become unhealthy or unavailable. This guide is particularly useful if you are not [shipping your logs and metrics](/deploy-manage/monitor/stack-monitoring/elastic-cloud-stack-monitoring.md) to a dedicated monitoring cluster. + +**What are the symptoms?** + +* [Full disk on single-node deployment](/troubleshoot/monitoring/unavailable-nodes.md#ec-single-node-deployment-disk-used) +* [Full disk on multiple-nodes deployment](/troubleshoot/monitoring/unavailable-nodes.md#ec-multiple-node-deployment-disk-used) +* [JVM heap usage exceeds the allowed threshold on master nodes](/troubleshoot/monitoring/unavailable-nodes.md#ec-jvm-heap-usage-exceed-allowed-threshold) +* [CPU usage exceeds the allowed threshold on master nodes](/troubleshoot/monitoring/unavailable-nodes.md#ec-cpu-usage-exceed-allowed-threshold) +* [Some nodes are unavailable and are displayed as missing](/troubleshoot/monitoring/unavailable-nodes.md#ec-nodes-unavailable-missing) + +**What is the impact?** + +* Only some search results are successful +* Ingesting, updating, and deleting data do not work +* Most {{es}} API requests fail + +::::{note} +Some actions described here, such as stopping indexing or Machine Learning jobs, are temporary remediations intended to get your cluster into a state where you can make configuration changes to resolve the issue. +:::: + + +For production deployments, we recommend setting up a dedicated monitoring cluster to collect metrics and logs, troubleshooting views, and cluster alerts. + +If your issue is not addressed here, then [contact Elastic support for help](/troubleshoot/index.md). + +## Full disk on single-node deployment [ec-single-node-deployment-disk-used] + +**Health check** + +1. Log in to the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body). +2. From the Elasticsearch Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. + + :::{image} /images/cloud-ec-quick-link-to-deployment.png + :alt: Quick link to the deployment page + ::: + +3. On your deployment page, scroll down to **Instances** and check if the disk allocation for your {{es}} instance is over 90%. + + :::{image} /images/cloud-ec-full-disk-single-node.png + :alt: Full disk on single-node deployment + ::: + + +**Possible cause** + +* The available storage is insufficient for the amount of ingested data. + +**Resolution** + +* [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). +* Increase the disk size on your Hot data and Content tier (scale up). 
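+
+Before deleting anything, it helps to confirm which indices are actually consuming the space. The following is a sketch of that check and of removing an index you no longer need; the index name is a placeholder, and deletion is irreversible:
+
+```sh
+# List indices sorted by on-disk size, largest first
+GET _cat/indices?v=true&s=store.size:desc&h=index,pri,rep,store.size
+
+# Only after confirming the data is no longer needed:
+DELETE /my-old-logs-000001
+```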
+ +::::{note} +If your {{es}} cluster is unhealthy and reports a status of red, then increasing the disk size of your Hot data and Content tier may fail. You might need to delete some data so the configuration can be edited. If you want to increase your disk size without deleting data, then [reach out to Elastic support](/troubleshoot/index.md) and we will assist you with scaling up. +:::: + + +**Preventions** + +* Increase the disk size on your Hot data and Content tier (scale up). + + From your deployment menu, go to the **Edit** page and increase the **Size per zone** for your Hot data and Content tiers. + + :::{image} /images/cloud-ec-increase-size-per-zone.png + :alt: Increase size per zone + ::: + +* Enable [autoscaling](/deploy-manage/autoscaling.md) to grow your cluster automatically when it runs out of space. +* Configure [ILM](/manage-data/lifecycle/index-lifecycle-management.md) policies to automatically delete unused data. +* Add nodes to your {{es}} cluster and enable [data tiers](/manage-data/lifecycle/data-tiers.md) to move older data that you don’t query often to more cost-effective storage. + + +## Full disk on multiple-nodes deployment [ec-multiple-node-deployment-disk-used] + +**Health check** + +1. Log in to the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body). +2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. + + :::{image} /images/cloud-ec-quick-link-to-deployment.png + :alt: Quick link to the deployment page + ::: + +3. On your deployment page, scroll down to **Instances** and check if the disk allocation for any of your {{es}} instances is over 90%. + + :::{image} /images/cloud-ec-full-disk-multiple-nodes.png + :alt: Full disk on multiple-nodes deployment + ::: + + +**Possible cause** + +* The available storage is insufficient for the amount of ingested data. + +**Resolution** + +* [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). +* Increase the disk size (scale up). + +::::{note} +If your {{es}} cluster is unhealthy and reports a status of red, the scale up configuration change to increasing disk size on the affected data tiers may fail. You might need to delete some data so the configuration can be edited. If you want to increase your disk size without deleting data, then [reach out to Elastic support](/troubleshoot/index.md) and we will assist you with scaling up. +:::: + + +**Preventions** + +* Increase the disk size (scale up). + + 1. On your deployment page, scroll down to **Instances** and identify the node attribute of the instances that are running out of disk space. + + :::{image} /images/cloud-ec-node-attribute.png + :alt: Instance node attribute + ::: + + 2. Use the node types identified at step 1 to find out the corresponding data tier. + + :::{image} /images/cloud-ec-node-types-data-tiers.png + :alt: Node type and corresponding attribute + ::: + + 3. From your deployment menu, go to the **Edit** page and increase the **Size per zone** for the data tiers identified at step 2. + + :::{image} /images/cloud-ec-increase-size-per-zone.png + :alt: Increase size per zone + ::: + +* Enable [autoscaling](/deploy-manage/autoscaling.md) to grow your cluster automatically when it runs out of space. +* Configure [ILM](/manage-data/lifecycle/index-lifecycle-management.md) policies to automatically delete unused data. 
+* Enable [data tiers](/manage-data/lifecycle/data-tiers.md) to move older data that you don’t query often to more cost-effective storage. + + +## JVM heap usage exceeds the allowed threshold on master nodes [ec-jvm-heap-usage-exceed-allowed-threshold] + +**Health check** + +1. Log in to the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body). +2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. + + :::{image} /images/cloud-ec-quick-link-to-deployment.png + :alt: Quick link to the deployment page + ::: + +3. On your deployment page, scroll down to **Instances** and check if the JVM memory pressure for your {{es}} instances is high. + + :::{image} /images/cloud-ec-deployment-instances-config.png + :alt: Deployment instances configuration + ::: + + +**Possible causes** + +* The master node is overwhelmed by a large number of snapshots or shards. + + * External tasks initiated by clients + + * Index, search, update + * Frequent template updates due to the Beats configuration + + * Internal tasks initiated by users + + * Machine Learning jobs, watches, monitoring, ingest pipeline + + * Internal tasks initiated by {{es}} + + * Nodes joining and leaving due to hardware failures + * Shard allocation due to nodes joining and leaving + * Configuration of [ILM](/manage-data/lifecycle/index-lifecycle-management.md) policies. + + +**Resolutions** + +* If the master node is overwhelmed by external tasks initiated by clients: + + Investigate which clients might be overwhelming the cluster and reduce the request rate or pause ingesting, searching, or updating from the client. If you are using Beats, temporarily stop the Beat that’s overwhelming the cluster to avoid frequent template updates. + +* If the master node is overwhelmed by internal tasks initiated by users: + + * Check [cluster-level pending tasks](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-pending-tasks). + * Reduce the number of Machine Learning jobs or watches. + * Change the number of ingest pipelines or processors to use less memory. + +* If the master node is overwhelmed by internal tasks initiated by {{es}}: + + * For nodes joining and leaving, this should resolve itself. If increasing the master nodes size doesn’t resolve the issue, contact support. + * For shard allocation, inspect the progress of shards recovery. + + * Make sure `indices.recovery.max_concurrent_operations` is not aggressive, which could cause the master to be unavailable. + * Make sure `indices.recovery.max_bytes_per_sec` is set adequately to avoid impact on ingest and search workload. + + * Check [ILM](/manage-data/lifecycle/index-lifecycle-management.md) policies to avoid index rollover and relocate actions that are concurrent and aggressive. + +* If the master node is overwhelmed by a large number of snapshots, reduce the number of snapshots in the repo. +* If the master node is overwhelmed by a large number of shards, delete unneeded indices and shrink read-only indices to fewer shards. For more information, check [Reduce a cluster’s shard count](/deploy-manage/production-guidance/optimize-performance/size-shards.md#reduce-cluster-shard-count). + + +## CPU usage exceeds the allowed threshold on master nodes [ec-cpu-usage-exceed-allowed-threshold] + +**Health check** + +By default, the allowed CPU usage threshold is set at 85%. + +1. 
Log in to the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body). +2. From the {{es}} Service panel, click the **Quick link** icon corresponding to the deployment that you want to manage. + + :::{image} /images/cloud-ec-quick-link-to-deployment.png + :alt: Quick link to the deployment page + ::: + +3. Identify the IDs of your master nodes. On your deployment page, scroll down to **Instances** and filter your instance configuration by master. The IDs of your master nodes are in the title. In this example, the IDs are 21, 26, and 27: + + :::{image} /images/cloud-ec-instances-filtered-by-master-id.png + :alt: Instances configuration filtered by master nodes ID + ::: + + ::::{note} + The name of the instance configuration might differ depending on the cloud provider. + :::: + +4. Navigate to the **Performance** page of your deployment. Check if the CPU usage of your master nodes exceeds 85%. Your master node has the format `instance-<ID>`, where `<ID>` is the ID of the master node. + +If you use [Stack Monitoring](https://www.elastic.co/guide/en/kibana/current/xpack-monitoring.html), open Kibana from your deployment page and select **Stack Monitoring** from the menu or the search bar. + +::::{note} +Stack Monitoring comes with out-of-the-box rules, but you need to enable them when prompted. +:::: + + +**Possible causes** + +* The master node is overwhelmed by a large number of snapshots or shards. +* The memory available on the master node is overwhelmed by these tasks: + + * External tasks initiated by clients + + * Index, search, update + * Frequent template updates due to the Beats configuration + + * Internal tasks initiated by users + + * Machine Learning jobs, watches, monitoring, ingest pipelines + + * Internal tasks initiated by {{es}} + + * Nodes joining and leaving due to hardware failures + * Shard allocation due to nodes joining and leaving + * Configuration of [ILM](/manage-data/lifecycle/index-lifecycle-management.md) policies. + + +**Resolutions** + +* Navigate to the **Edit** page of your deployment and increase the master node size. +* [Upgrade the cluster](/deploy-manage/upgrade/deployment-or-cluster.md) to the latest version. +* If the master node is overwhelmed by external tasks initiated by clients: + + * Reduce the request rate or pause ingesting, searching, or updating from the client. + * Enable ingest and search-based autoscaling. + * Stop Beats to avoid frequent template updates. + +* If the master node is overwhelmed by internal tasks initiated by users: + + * Check [cluster-level pending tasks](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-pending-tasks). + * Reduce the number of Machine Learning jobs or watches. + * Change the number of ingest pipelines or processors to use less memory. + +* If the master node is overwhelmed by internal tasks initiated by {{es}}: + + * For nodes joining and leaving, this should resolve itself. If increasing the master node size doesn’t resolve the issue, contact support. + * For shard allocation, inspect the progress of shard recovery. If there’s no progress, contact support. + + * Make sure `indices.recovery.max_concurrent_operations` is not set too aggressively, which could cause the master to become unavailable. + * Make sure `indices.recovery.max_bytes_per_sec` is set appropriately to avoid impacting the ingest and search workload.
+ + * Check [ILM](/manage-data/lifecycle/index-lifecycle-management.md) policies to avoid concurrent and aggressive index rollover and relocate actions. + +* If the master node is overwhelmed by a large number of snapshots, reduce the number of snapshots in the repository. +* If the master node is overwhelmed by a large number of shards, reduce the number of shards on the node. For more information, check [Size your shards](/deploy-manage/production-guidance/optimize-performance/size-shards.md). + + +## Some nodes are unavailable and are displayed as missing [ec-nodes-unavailable-missing] + +**Health check** + +* Use the [Metrics inventory](https://www.elastic.co/guide/en/observability/current/monitor-infrastructure-and-hosts.html) to identify unavailable or unhealthy nodes. If fewer than the minimum number of master nodes are available, {{es}} is not available. + +**Possible causes** + +* Hardware issue. +* Routing has stopped because of a previous {{es}} configuration failure. +* Disk/memory/CPU are saturated. +* The network is saturated or disconnected. +* Nodes are unable to join. + +**Resolutions** + +* Hardware issue: Any unhealthy hardware detected by the platform is automatically vacated within the hour. If this doesn’t happen, contact support. +* Routing stopped: A failed {{es}} configuration might stop requests from being routed to the node. Restart routing manually to bring the node back to health. +* Disk/memory/CPU saturated: + + * [Delete unused data](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-delete). + * Increase disk size. + * [Enable autoscaling](/deploy-manage/autoscaling.md). + * Configure [ILM](/manage-data/lifecycle/index-lifecycle-management.md) policies. + * [Manage data tiers](/manage-data/lifecycle/data-tiers.md). + +* Network saturated or disconnected: Contact support. +* Nodes unable to join: Fix the {{es}} configuration. If the nodes still cannot join, contact support. -% What needs to be done: Lift-and-shift -% Use migrated content from existing pages that map to this page: -% - [ ] ./raw-migrated-files/cloud/cloud/ec-scenario_why_is_my_node_unavailable.md -% Notes: - Lift and shift the first one (cloud) only -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/echscenario_why_is_my_node_unavailable.md -% Notes: dupe -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-single-node-deployment-disk-used.md -% Notes: dupe -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-multiple-node-deployment-disk-used.md -% Notes: dupe -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-jvm-heap-usage-exceed-allowed-threshold.md -% Notes: dupe -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-cpu-usage-exceed-allowed-threshold.md -% Notes: dupe -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-nodes-unavailable-missing.md % Notes: dupe % Internal links rely on the following IDs being on this page (e.g.
as a heading ID, paragraph ID, etc): diff --git a/troubleshoot/monitoring/unavailable-shards.md b/troubleshoot/monitoring/unavailable-shards.md index f0950e0bbe..758ddfa3da 100644 --- a/troubleshoot/monitoring/unavailable-shards.md +++ b/troubleshoot/monitoring/unavailable-shards.md @@ -1,4 +1,5 @@ --- +navigation_title: "Unavailable shards" mapped_urls: - https://www.elastic.co/guide/en/cloud/current/ec-scenario_why_are_shards_unavailable.html - https://www.elastic.co/guide/en/cloud-heroku/current/echscenario_why_are_shards_unavailable.html @@ -7,65 +8,319 @@ mapped_urls: - https://www.elastic.co/guide/en/cloud-heroku/current/ech-remediate-issues-allocation-explain-API.html --- -# Unavailable shards +# Diagnose unavailable shards [ec-scenario_why_are_shards_unavailable] -% What needs to be done: Lift-and-shift +This section describes how to analyze unassigned shards using the Elasticsearch APIs and Kibana. -% Use migrated content from existing pages that map to this page: +* [Analyze unassigned shards using the Elasticsearch API](/troubleshoot/monitoring/unavailable-shards.md#ec-analyze_shards_with-api) +* [Analyze unassigned shards using the Kibana UI](/troubleshoot/monitoring/unavailable-shards.md#ec-analyze_shards_with-kibana) +* [Remediate common issues returned by the cluster allocation explain API](#ec-remediate-issues-allocation-explain-API) -% - [ ] ./raw-migrated-files/cloud/cloud/ec-scenario_why_are_shards_unavailable.md -% Notes: - Lift and shift the first one (cloud) only -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/echscenario_why_are_shards_unavailable.md -% Notes: dupe, redirect -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-api.md -% Notes: dupe, redirect -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-analyze_shards_with-kibana.md -% Notes: dupe, redirect -% - [ ] ./raw-migrated-files/cloud/cloud-heroku/ech-remediate-issues-allocation-explain-API.md -% Notes: dupe, redirect +{{es}} distributes the documents in an index across multiple shards and distributes copies of those shards across multiple nodes in the cluster. This both increases capacity and makes the cluster more resilient, ensuring your data remains available if a node goes down. -% Internal links rely on the following IDs being on this page (e.g. as a heading ID, paragraph ID, etc): +A healthy (green) cluster has a primary copy of each shard and the required number of replicas are assigned to different nodes in the cluster. -$$$ech-disk-full$$$ +If a cluster has unassigned replica shards, it is functional but vulnerable in the event of a failure. The cluster is unhealthy and reports a status of yellow. -$$$ech-node-moved-to-another-host$$$ +If a cluster has unassigned primary shards, some of your data is unavailable. The cluster is unhealthy and reports a status of red. -$$$ech-cannot-assign-shards-on-allocation-rule$$$ +A formerly-healthy cluster might have unassigned shards because nodes have dropped out or moved, are running out of disk space, or are hitting allocation limits. -$$$ech-eligible-data-nodes-less-than-replicas$$$ +If a cluster has unassigned shards, you might see an error message such as this on the Elastic Cloud console: -$$$ech-searchable-snapshot-indices-not-allocated$$$ +:::{image} /images/cloud-ec-unhealthy-deployment.png +:alt: Unhealthy deployment error message +::: -$$$ech-max-retry-exceeded$$$ +If your issue is not addressed here, then [contact Elastic support for help](/troubleshoot/index.md). 
-$$$ech-max-shard-per-node$$$ +## Analyze unassigned shards using the {{es}} API [ec-analyze_shards_with-api] -$$$ec-analyze_shards_with-api$$$ +You can retrieve information about the status of your cluster, indices, and shards using the {{es}} API. To access the API you can either use the [Kibana Dev Tools Console](/explore-analyze/query-filter/tools/console.md), or the [Elasticsearch API console](https://www.elastic.co/guide/en/cloud/current/ec-api-console.html). If you have your own way to run the {{es}} API, check [How to access the API](https://www.elastic.co/guide/en/cloud/current/ec-api-access.html). This section shows you how to: -$$$ec-analyze_shards_with-kibana$$$ +* [Check cluster health](/troubleshoot/monitoring/unavailable-shards.md#ec-check-cluster-health) +* [Check unhealthy indices](/troubleshoot/monitoring/unavailable-shards.md#ec-check-unhealthy-indices) +* [Check which shards are unassigned](/troubleshoot/monitoring/unavailable-shards.md#ec-check-which-unassigned-shards) +* [Check why shards are unassigned](/troubleshoot/monitoring/unavailable-shards.md#ec-check-why-unassigned-shards) +* [Check Elasticsearch cluster logs](/troubleshoot/monitoring/unavailable-shards.md#ec-check-es-cluster-logs) -$$$ec-cannot-assign-shards-on-allocation-rule$$$ -$$$ec-check-cluster-health$$$ +#### Check cluster health [ec-check-cluster-health] -$$$ec-check-es-cluster-logs$$$ +Use the [Cluster health API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-health): -$$$ec-check-unhealthy-indices$$$ +```json +GET /_cluster/health/ +``` -$$$ec-check-which-unassigned-shards$$$ +This command returns the cluster status (green, yellow, or red) and shows the number of unassigned shards: -$$$ec-check-why-unassigned-shards$$$ +```json +{ + "cluster_name" : "xxx", + "status" : "red", + "timed_out" : false, + "number_of_nodes" : "x", + "number_of_data_nodes" : "x", + "active_primary_shards" : 116, + "active_shards" : 229, + "relocating_shards" : 0, + "initializing_shards" : 0, + "unassigned_shards" : 1, + "delayed_unassigned_shards" : 0, + "number_of_pending_tasks" : 0, + "number_of_inflight_fetch" : 0, + "task_max_waiting_in_queue_millis" : 0, + "active_shards_percent_as_number" : 98.70689655172413 +} +``` -$$$ec-disk-full$$$ -$$$ec-eligible-data-nodes-less-than-replicas$$$ +#### Check unhealthy indices [ec-check-unhealthy-indices] -$$$ec-max-retry-exceeded$$$ +Use the [cat indices API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-indices) to get the status of individual indices. Specify the `health` parameter to limit the results to a particular status, for example `?v&health=red` or `?v&health=yellow`. -$$$ec-max-shard-per-node$$$ +```json +GET /_cat/indices?v&health=red +``` -$$$ec-node-moved-to-another-host$$$ +This command returns any indices that have unassigned primary shards (red status): -$$$ec-remediate-issues-allocation-explain-API$$$ +```json +red open filebeat-7.10.0-2022.01.07-000014 C7N8fxGwRxK0JcwXH18zVg 1 1 +red open filebeat-7.9.3-2022.01.07-000015 Ib4UIJNVTtOg6ovzs011Lq 1 1 +``` -$$$ec-searchable-snapshot-indices-not-allocated$$$ +For more information, refer to [Fix a red or yellow cluster status](/troubleshoot/elasticsearch/red-yellow-cluster-status.md#fix-red-yellow-cluster-status). 
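+
+To get a quick per-index count of unassigned shards without listing every shard, you can also ask the cluster health API for index-level detail. This is a sketch that uses `filter_path` to trim the response; adjust the filter to your needs:
+
+```json
+# Report status and unassigned shard counts for each index
+GET /_cluster/health?level=indices&filter_path=indices.*.status,indices.*.unassigned_shards
+```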
+ + +#### Check which shards are unassigned [ec-check-which-unassigned-shards] + +Use the [cat shards API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cat-shards): + +```json +GET /_cat/shards/?v +``` + +This command returns the index name, followed by the shard number, the shard type (primary or replica), and the shard status: + +```json +filebeat-7.10.0-2022.01.07-000014 0 P UNASSIGNED +filebeat-7.9.3-2022.01.07-000015 1 P UNASSIGNED +filebeat-7.9.3-2022.01.07-000015 2 r UNASSIGNED +``` + + +#### Check why shards are unassigned [ec-check-why-unassigned-shards] + +To understand why shards are unassigned, run the [Cluster allocation explain API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-allocation-explain). + +Running the API call `GET _cluster/allocation/explain` retrieves an allocation explanation for unassigned primary or replica shards. + +For example, if `_cat/shards` shows that primary shard 1 of the `filebeat-7.9.3-2022.01.07-000015` index is unassigned, you can get the allocation explanation with the following request: + +```json +GET _cluster/allocation/explain +{ + "index": "filebeat-7.9.3-2022.01.07-000015", + "shard": 1, + "primary": true +} +``` + +The response is as follows: + +```json +{ + "index": "filebeat-7.9.3-2022.01.07-000015", + "shard": 1, + "primary": true, + "current_state": "unassigned", + "unassigned_info": { + "reason": "CLUSTER_RECOVERED", + "at": "2022-04-12T13:06:36.125Z", + "last_allocation_status": "no_valid_shard_copy" + }, + "can_allocate": "no_valid_shard_copy", + "allocate_explanation": "cannot allocate because a previous copy of the primary shard existed but can no longer be found on the nodes in the cluster", + "node_allocation_decisions": [ + { + "node_id": "xxxx", + "node_name": "instance-0000000005", + (... skip ...) + "node_decision": "no", + "store": { + "found": false + } + } + ] +} +``` + + +#### Check {{es}} cluster logs [ec-check-es-cluster-logs] + +To determine the allocation issue, you can [check the logs](/deploy-manage/monitor/stack-monitoring/elastic-cloud-stack-monitoring.md#ec-check-logs). This is easier if you have set up a dedicated monitoring deployment. + + +## Analyze unassigned shards using the Kibana UI [ec-analyze_shards_with-kibana] + +If you are shipping logs and metrics to a monitoring deployment, go through the following steps. + +1. Select your deployment from the {{es}} Service panel and navigate to the **Logs and metrics** page. +2. Click **Enable**. +3. Choose the deployment to send your logs and metrics to. +4. Click **Save**. It might take a few minutes to apply the configuration changes. +5. Click **View** to open the Kibana UI and get more details on metrics and logs. + +:::{image} /images/cloud-ec-logs-metrics-page.png +:alt: Log and metrics page +::: + +The unhealthy indices appear with a red or yellow status. + +:::{image} /images/cloud-ec-red-yellow-indices.png +:alt: Unhealthy indices in red or yellow status +::: + + +## Remediate common issues returned by the cluster allocation explain API [ec-remediate-issues-allocation-explain-API] + +Here’s how to resolve the most common causes of unassigned shards reported by the cluster allocation explain API.
+ +* [Disk is full](/troubleshoot/monitoring/unavailable-shards.md#ec-disk-full) +* [A node containing data has moved to a different host](/troubleshoot/monitoring/unavailable-shards.md#ec-node-moved-to-another-host) +* [Unable to assign shards based on the allocation rule](/troubleshoot/monitoring/unavailable-shards.md#ec-cannot-assign-shards-on-allocation-rule) +* [The number of eligible data nodes is less than the number of replicas](/troubleshoot/monitoring/unavailable-shards.md#ec-eligible-data-nodes-less-than-replicas) +* [A snapshot issue prevents searchable snapshot indices from being allocated](/troubleshoot/monitoring/unavailable-shards.md#ec-searchable-snapshot-indices-not-allocated) +* [Maximum retry times exceeded](/troubleshoot/monitoring/unavailable-shards.md#ec-max-retry-exceeded) +* [Max shard per node reached the limit](/troubleshoot/monitoring/unavailable-shards.md#ec-max-shard-per-node) + +If your issue is not addressed here, then [contact Elastic support for help](/troubleshoot/index.md). + +### Disk is full [ec-disk-full] + +**Symptom** + +If the disk usage exceeded the threshold, you may get one or more of the following messages: + +`the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=90%], using more disk space than the maximum allowed [90.0%], actual free: [9.273781776428223%]` + +`unable to force allocate shard to [%s] during replacement, as allocating to this node would cause disk usage to exceed 100%% ([%s] bytes above available disk space)` + +`the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=85%], using more disk space than the maximum allowed [85.0%], actual free: [14.119771122932434%]` + +`after allocating [[restored-xxx][0], node[null], [P], recovery_source[snapshot recovery [Om66xSJqTw2raoNyKxsNWg] from xxx/W5Yea4QuR2yyZ4iM44fumg], s[UNASSIGNED], unassigned_info[[reason=NEW_INDEX_RESTORED], at[2022-03-02T10:56:58.210Z], delayed=false, details[restore_source[xxx]], allocation_status[fetching_shard_data]]] node [GTXrECDRRmGkkAnB48hPqw] would have more than the allowed 10% free disk threshold (8.7% free), preventing allocation` + +**Resolutions** + +Review the topic for your deployment architecture: + +* [Full disk on single-node deployment](/troubleshoot/monitoring/unavailable-nodes.md#ec-single-node-deployment-disk-used) +* [Full disk on multiple-nodes deployment](/troubleshoot/monitoring/unavailable-nodes.md#ec-multiple-node-deployment-disk-used) + +To learn more, review the following topics: + +* [Cluster-level shard allocation and routing settings](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-cluster.html) +* [Fix watermark errors](/troubleshoot/elasticsearch/fix-watermark-errors.md) + + +### A node containing data has moved to a different host [ec-node-moved-to-another-host] + +**Symptom** + +During the routine system maintenance performed by Elastic, it might happen that a node moves to a different host. If the indices are not configured with replica shards, the shard data on the {{es}} node that is moved will be lost, and you might get one or more of these messages: + +`cannot allocate because a previous copy of the primary shard existed but can no longer be found on the nodes in the cluster` + +**Resolutions** + +Configure an [highly available cluster](/deploy-manage/production-guidance/plan-for-production-elastic-cloud.md) to keep your service running. 
Also, consider taking the following actions to bring your deployment back to health and recover your data from the snapshot. + +* [Close the red indices](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-close) +* [Restore the indices](/deploy-manage/tools/snapshot-and-restore.md) from the last successful snapshot + +For more information, check also [Designing for resilience](/deploy-manage/production-guidance/availability-and-resilience.md). + + +### Unable to assign shards based on the allocation rule [ec-cannot-assign-shards-on-allocation-rule] + +**Symptom** + +When shards cannot be assigned, due to [data tier allocation](/manage-data/lifecycle/data-tiers.md#data-tier-allocation) or [attribute-based allocation](/deploy-manage/distributed-architecture/shard-allocation-relocation-recovery/index-level-shard-allocation.md), you might get one or more of these messages: + +`node does not match index setting [index.routing.allocation.include] filters [node_type:\"cold\"]` + +`index has a preference for tiers [data_cold] and node does not meet the required [data_cold] tier` + +`index has a preference for tiers [data_cold,data_warm,data_hot] and node does not meet the required [data_cold] tier` + +`index has a preference for tiers [data_warm,data_hot] and node does not meet the required [data_warm] tier` + +`this node's data roles are exactly [data_frozen] so it may only hold shards from frozen searchable snapshots, but this index is not a frozen searchable snapshot` + +**Resolutions** + +* Make sure nodes are available in each data tier and have sufficient disk space. +* [Check the index settings](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-indices) and ensure shards can be allocated to the expected data tier. +* Check the [ILM policy](/manage-data/lifecycle/index-lifecycle-management.md) and check for issues with the [allocate action](https://www.elastic.co/guide/en/elasticsearch/reference/current/ilm-allocate.html). +* Inspect the [index templates](/manage-data/data-store/templates.md) and check for issues with the index settings. + + +### The number of eligible data nodes is less than the number of replicas [ec-eligible-data-nodes-less-than-replicas] + +**Symptom** + +Unassigned replica shards are often caused by there being fewer eligible data nodes than the configured number_of_replicas. + +**Resolutions** + +* Add more [eligible data nodes or more availability zones](/deploy-manage/deploy/elastic-cloud/ec-customize-deployment-components.md) to ensure resiliency. +* Adjust the `number_of_replicas` [setting](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-put-settings) for your indices to the number of eligible data nodes -1. + + +### A snapshot issue prevents searchable snapshot indices from being allocated [ec-searchable-snapshot-indices-not-allocated] + +**Symptom** + +Some snapshots operations might be impacted, as shown in the following example: + +`failed shard on node [Yc_Jbf73QVSVYSqZT8HPlA]: failed recovery, failure RecoveryFailedException[[restored-my_index-2021.32][1]: … SnapshotMissingException[[found-snapshots:2021.08.25-my_index-2021.32-default_policy-_j2k8it9qnehe1t-2k0u6a/iOAoyjWLTyytKkW3_wF1jw] is missing]; nested: NoSuchFileException[Blob object [snapshots/52bc3ae2030a4df8ab10559d1720a13c/indices/WRlkKDuPSLW__M56E8qbfA/1/snap-iOAoyjWLTyytKkW3_wF1jw.dat] not found: The specified key does not exist. 
(Service: Amazon S3; Status Code: 404; Error Code: NoSuchKey; Request ID: 4AMTM1XFMTV5F00V; S3 Extended Request ID:` + +**Resolutions** + +Upgrade to {{es}} version 7.17.0 or later, which resolves bugs that affected snapshot operations in earlier versions. Check [Upgrade versions](/deploy-manage/upgrade/deployment-or-cluster.md) for more details. + +If you can’t upgrade, you can recreate the snapshot repository as a workaround. + +The bugs also affect searchable snapshots. If you still have data in the cluster but cannot restore from the searchable snapshot, you can try reindexing and recreating the searchable snapshot: + +* Reindex all the affected indices to new regular indices +* Remove the affected frozen indices +* Take the snapshot and mount the indices again + + +### Max shard per node reached the limit [ec-max-shard-per-node] + +**Symptom** + +The parameter [`cluster.max_shards_per_node`](https://www.elastic.co/guide/en/elasticsearch/reference/current/misc-cluster-settings.html#cluster-max-shards-per-node) limits the total number of primary and replica shards for the cluster. If your cluster has a number of shards beyond this limit, you might get the following message: + +`Validation Failed: 1: this action would add [2] shards, but this cluster currently has [1000]/[1000] maximum normal shards open` + +**Resolutions** + +Delete unnecessary indices, add more data nodes, and [avoid oversharding](/deploy-manage/production-guidance/optimize-performance/size-shards.md) as too many shards can overwhelm your cluster. If you cannot take these actions, and you’re confident your changes won’t destabilize the cluster, you can temporarily increase the limit using the [cluster update settings API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-put-settings) and retry the action. For more details, check [Troubleshoot shard-related errors](/deploy-manage/production-guidance/optimize-performance/size-shards.md#troubleshoot-shard-related-errors). + + +### Maximum retry times exceeded [ec-max-retry-exceeded] + +**Symptom** + +The cluster will attempt to allocate a shard a few times, before giving up and leaving the shard unallocated. On {{es}} Service, `index.allocation.max_retries` defaults to 5. If allocation fails after the maximum number of retries, you might get the following message: + +`shard has exceeded the maximum number of retries [%d] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [%s]` + +**Resolutions** + +Run [`POST /_cluster/reroute?retry_failed=true`](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-reroute) API to retry. If it still fails, rerun the [Cluster allocation explain](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-cluster-allocation-explain) API to diagnose the problem. 
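+
+The last two remediations can be sketched as follows from the [Kibana Dev Tools Console](/explore-analyze/query-filter/tools/console.md). The shard limit shown is only an illustrative value; prefer reducing the shard count, and reset the setting to its default once the underlying issue is fixed:
+
+```json
+# Retry shard allocations that hit the maximum number of retries
+POST /_cluster/reroute?retry_failed=true
+
+# Only if the cluster-wide shard limit is the blocker and the cluster can
+# handle more shards: raise the limit temporarily (illustrative value)
+PUT /_cluster/settings
+{
+  "persistent": {
+    "cluster.max_shards_per_node": 1200
+  }
+}
+
+# If shards are still unassigned, get a fresh allocation explanation
+GET /_cluster/allocation/explain
+```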
\ No newline at end of file diff --git a/troubleshoot/toc.yml b/troubleshoot/toc.yml index 0c7e894f8b..2755e74a01 100644 --- a/troubleshoot/toc.yml +++ b/troubleshoot/toc.yml @@ -141,9 +141,10 @@ toc: - file: monitoring/unavailable-nodes.md - file: monitoring/unavailable-shards.md - file: monitoring/performance.md - - file: monitoring/high-memory-pressure.md - - file: monitoring/high-availability.md - - file: monitoring/cluster-response-time.md + children: + - file: monitoring/high-memory-pressure.md + - file: monitoring/high-availability.md + - file: monitoring/cluster-response-time.md - file: monitoring/node-moves-outages.md - file: monitoring/deployment-health-warnings.md - file: monitoring/node-bootlooping.md From d21df071f0645a949b355b2c40029845facd2b31 Mon Sep 17 00:00:00 2001 From: Marci W <333176+marciw@users.noreply.github.com> Date: Mon, 17 Feb 2025 16:26:32 -0500 Subject: [PATCH 2/4] strike that, reverse it --- troubleshoot/monitoring/node-moves-outages.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/troubleshoot/monitoring/node-moves-outages.md b/troubleshoot/monitoring/node-moves-outages.md index 04f45c2de9..7300b596f1 100644 --- a/troubleshoot/monitoring/node-moves-outages.md +++ b/troubleshoot/monitoring/node-moves-outages.md @@ -1,10 +1,10 @@ --- -navigation_title: "Troubleshoot node moves and outages" +navigation_title: "Node moves and outages" mapped_pages: - https://www.elastic.co/guide/en/cloud/current/ec-deployment-node-move.html --- -# Node moves and outages [ec-deployment-node-move] +# Troubleshoot node moves and outages [ec-deployment-node-move] To ensure that your nodes are located on healthy hosts, we vacate nodes to perform routine system maintenance or to remove a host with hardware issues from service. From 7f7551737b2a6e75722f6c6e7819780187265895 Mon Sep 17 00:00:00 2001 From: Marci W <333176+marciw@users.noreply.github.com> Date: Mon, 17 Feb 2025 16:30:13 -0500 Subject: [PATCH 3/4] pesky relative links --- troubleshoot/monitoring/performance.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/troubleshoot/monitoring/performance.md b/troubleshoot/monitoring/performance.md index dcd472636a..776f2f7bf2 100644 --- a/troubleshoot/monitoring/performance.md +++ b/troubleshoot/monitoring/performance.md @@ -11,7 +11,7 @@ You have a smaller {{es}} cluster and you’ve noticed that performance seems to When you look in the **Cluster Performance Metrics** section of the [Elasticsearch Service Console](https://cloud.elastic.co?page=docs&placement=docs-body), you get the following metrics: -:::{image} ../../../images/cloud-metrics-credits.png +:::{image} /images/cloud-metrics-credits.png :alt: CPU usage versus CPU credits over time ::: From a31f11f23a5c2935d97d1740b77b7d64ca52bf09 Mon Sep 17 00:00:00 2001 From: Marci W <333176+marciw@users.noreply.github.com> Date: Mon, 17 Feb 2025 16:41:57 -0500 Subject: [PATCH 4/4] Done for now --- troubleshoot/monitoring/cloud.md | 16 ++++++++++------ troubleshoot/monitoring/high-availability.md | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/troubleshoot/monitoring/cloud.md b/troubleshoot/monitoring/cloud.md index 3f8c9ea237..1f62c78f3d 100644 --- a/troubleshoot/monitoring/cloud.md +++ b/troubleshoot/monitoring/cloud.md @@ -7,12 +7,16 @@ mapped_pages: Use the topics in this section to troubleshoot monitoring, including AutoOps. 
-% TODO topic links - - - - - +* [](/troubleshoot/monitoring/autoops.md) +* [](/troubleshoot/monitoring/unavailable-nodes.md) +* [](/troubleshoot/monitoring/unavailable-shards.md) +* [](/troubleshoot/monitoring/performance.md) +* [](/troubleshoot/monitoring/high-memory-pressure.md) +* [](/troubleshoot/monitoring/high-availability.md) +* [](/troubleshoot/monitoring/cluster-response-time.md) +* [](/troubleshoot/monitoring/node-moves-outages.md) +* [](/troubleshoot/monitoring/deployment-health-warnings.md) +* [](/troubleshoot/monitoring/node-bootlooping.md) diff --git a/troubleshoot/monitoring/high-availability.md b/troubleshoot/monitoring/high-availability.md index 081e2eaff1..e6fb0600f5 100644 --- a/troubleshoot/monitoring/high-availability.md +++ b/troubleshoot/monitoring/high-availability.md @@ -1,10 +1,11 @@ --- +navigation_title: "Cluster performance metrics" mapped_pages: - https://www.elastic.co/guide/en/cloud/current/ec-scenario_is_my_cluster_really_highly_available.html - https://www.elastic.co/guide/en/cloud-heroku/current/echscenario_is_my_cluster_really_highly_available.html --- -# Cluster performance metrics [ec-scenario_is_my_cluster_really_highly_available] +# Troubleshoot cluster availability using performance metrics [ec-scenario_is_my_cluster_really_highly_available] % TODO: Edit edit edit