Elasticsearch Version
8.15.3
Installed Plugins
No response
Java Version
bundled
OS Version
ESS
Problem Description
Related to:
- Allow changing index settings of a searchable snapshot in the frozen tier #90871
- Ignore total_shards_per_node setting on searchable snapshots in frozen #97979
The allocate action appears to take place after the searchable_snapshot action. This means a setting like total_shards_per_node (e.g. set at index creation time) can cause an index to become stuck in the cold phase, unable to complete its searchable_snapshot action, if the number of cold nodes can't accommodate total_shards_per_node. In the repro below, the restored index has 3 primaries with total_shards_per_node=1 but only a single cold node, so at most one shard can ever be assigned and the restore never goes green.
Steps to Reproduce
Set up a 4x node cluster: 3x hot nodes + 1x cold node.
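# Optional sanity check: confirm the tier layout (node.role shows h for data_hot, c for data_cold)
GET _cat/nodes?v&h=name,node.role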
# Set ILM poll interval to 10s for faster testing
PUT _cluster/settings
{
"persistent": {
"indices.lifecycle.poll_interval": "10s"
}
}
# Create ILM policy
PUT _ilm/policy/test-policy
{
"policy": {
"phases": {
"hot": {
"actions": {
"rollover": {
"max_docs": 3
},
"set_priority": {
"priority": 100
}
},
"min_age": "0ms"
},
"cold": {
"min_age": "15s",
"actions": {
"allocate": {
"total_shards_per_node": 3
},
"searchable_snapshot": {
"snapshot_repository": "found-snapshots"
}
}
}
}
}
}
# Create index template w/ ILM policy attached.
PUT _index_template/test-template
{
"template": {
"settings": {
"index": {
"lifecycle": {
"name": "test-policy",
"rollover_alias": "test"
},
"number_of_replicas": "0",
"number_of_shards": "3",
"routing.allocation.total_shards_per_node": "1"
}
}
},
"index_patterns": [
"test-*"
],
"composed_of": []
}
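# Optional: preview the settings an index matching test-* would get from this template
# (note routing.allocation.total_shards_per_node=1, which is what triggers the bug)
POST _index_template/_simulate_index/test-000001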
# Bootstrap the first index
PUT test-000001
{
"aliases": {
"test": {
"is_write_index": true
}
}
}
# Index some data
POST _bulk?refresh=wait_for
{ "index" : { "_index" : "test" } }
{ "field" : "Hello World!" }
{ "index" : { "_index" : "test" } }
{ "field" : "Hello World!" }
{ "index" : { "_index" : "test" } }
{ "field" : "Hello World!" }
# Confirm rollover after ~10s
GET _cat/indices/*test*?v
# Check allocation of our searchable snapshot
GET _cat/shards/restored-test-000001?v
index shard prirep state docs store dataset ip node
restored-test-000001 0 p STARTED 0 227b 227b 10.46.66.98 instance-0000000003
restored-test-000001 1 p UNASSIGNED
restored-test-000001 2 p UNASSIGNED
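# Optional: ILM explain should show the restored index still working through the
# cold phase's searchable_snapshot action
GET restored-test-000001/_ilm/explain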
# Check allocation explain
GET _cluster/allocation/explain
{
"index": "restored-test-000001",
"shard": 1,
"primary": true
}
# Response:
{
"index": "restored-test-000001",
"shard": 1,
"primary": true,
"current_state": "unassigned",
"unassigned_info": {
"reason": "NEW_INDEX_RESTORED",
"at": "2024-10-23T22:00:03.477Z",
"details": "restore_source[found-snapshots/2024.10.23-test-000001-test-policy-74cxzb2lsrimys49turtmg]",
"last_allocation_status": "no"
},
"can_allocate": "no",
"allocate_explanation": "Elasticsearch isn't allowed to allocate this shard to any of the nodes in the cluster. Choose a node to which you expect this shard to be allocated, find this node in the node-by-node explanation, and address the reasons which prevent Elasticsearch from allocating this shard there.",
"node_allocation_decisions": [
{
"node_id": "29eoo3ieTSCKiO7UtSxMmA",
"node_name": "instance-0000000002",
"transport_address": "10.46.65.154:19583",
"node_attributes": {
"logical_availability_zone": "zone-2",
"availability_zone": "us-east4-b",
"instance_configuration": "gcp.es.datahot.n2.68x32x45",
"region": "unknown-region",
"server_name": "instance-0000000002.0fef76a1c07c49d29a1b3dd34e5fae4d",
"transform.config_version": "10.0.0",
"xpack.installed": "true",
"ml.config_version": "12.0.0",
"data": "hot"
},
"roles": [
"data_content",
"data_hot",
"ingest",
"master",
"remote_cluster_client",
"transform"
],
"node_decision": "no",
"deciders": [
{
"decider": "data_tier",
"decision": "NO",
"explanation": "index has a preference for tiers [data_cold,data_warm,data_hot] and node does not meet the required [data_cold] tier"
}
]
},
{
"node_id": "3idS6nroSrGp0AHT3HVsAg",
"node_name": "instance-0000000001",
"transport_address": "10.46.64.24:19733",
"node_attributes": {
"logical_availability_zone": "zone-1",
"availability_zone": "us-east4-a",
"instance_configuration": "gcp.es.datahot.n2.68x32x45",
"region": "unknown-region",
"server_name": "instance-0000000001.0fef76a1c07c49d29a1b3dd34e5fae4d",
"xpack.installed": "true",
"transform.config_version": "10.0.0",
"ml.config_version": "12.0.0",
"data": "hot"
},
"roles": [
"data_content",
"data_hot",
"ingest",
"master",
"remote_cluster_client",
"transform"
],
"node_decision": "no",
"deciders": [
{
"decider": "data_tier",
"decision": "NO",
"explanation": "index has a preference for tiers [data_cold,data_warm,data_hot] and node does not meet the required [data_cold] tier"
}
]
},
{
"node_id": "aLXOkK2dShSR9AzI94EoGQ",
"node_name": "instance-0000000003",
"transport_address": "10.46.66.98:19272",
"node_attributes": {
"logical_availability_zone": "zone-0",
"availability_zone": "us-east4-a",
"instance_configuration": "gcp.es.datacold.n2.68x10x190",
"region": "unknown-region",
"server_name": "instance-0000000003.0fef76a1c07c49d29a1b3dd34e5fae4d",
"transform.config_version": "10.0.0",
"xpack.installed": "true",
"ml.config_version": "12.0.0",
"data": "cold"
},
"roles": [
"data_cold",
"remote_cluster_client"
],
"node_decision": "no",
"deciders": [
{
"decider": "shards_limit",
"decision": "NO",
"explanation": "too many shards [1] allocated to this node for index [restored-test-000001], index setting [index.routing.allocation.total_shards_per_node=1]"
}
]
},
{
"node_id": "tKgoJAjoTTar3do03IeJxw",
"node_name": "instance-0000000000",
"transport_address": "10.46.66.23:19721",
"node_attributes": {
"logical_availability_zone": "zone-0",
"availability_zone": "us-east4-c",
"instance_configuration": "gcp.es.datahot.n2.68x32x45",
"region": "unknown-region",
"server_name": "instance-0000000000.0fef76a1c07c49d29a1b3dd34e5fae4d",
"transform.config_version": "10.0.0",
"xpack.installed": "true",
"ml.config_version": "12.0.0",
"data": "hot"
},
"roles": [
"data_content",
"data_hot",
"ingest",
"master",
"remote_cluster_client",
"transform"
],
"node_decision": "no",
"deciders": [
{
"decider": "data_tier",
"decision": "NO",
"explanation": "index has a preference for tiers [data_cold,data_warm,data_hot] and node does not meet the required [data_cold] tier"
}
]
}
]
}
We can unstick the index by clearing total_shards_per_node:
PUT restored-test-000001/_settings
{
"index.routing.allocation.total_shards_per_node": null
}
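# Optional: confirm the remaining shards are assigned once the limit is cleared
GET _cat/shards/restored-test-000001?v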
We can also add an allocate action that sets total_shards_per_node in a warm phase as a workaround, as sketched below.
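A sketch based on the repro policy above (the warm min_age and the value 3 are illustrative):
# Relax the per-node shard limit before the cold phase's searchable_snapshot action runs
PUT _ilm/policy/test-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": { "max_docs": 3 },
          "set_priority": { "priority": 100 }
        }
      },
      "warm": {
        "min_age": "10s",
        "actions": {
          "allocate": { "total_shards_per_node": 3 }
        }
      },
      "cold": {
        "min_age": "15s",
        "actions": {
          "searchable_snapshot": { "snapshot_repository": "found-snapshots" }
        }
      }
    }
  }
}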
Logs (if relevant)
No response