Fix NPE when evaluating the disk health for non-data nodes (#92643) (#92648)

andreidan · web-flow · commit 169bb729bd15 · 2023-01-03T18:04:58.000Z
Non-data nodes are not part of the `RoutingTable` so the expression ``` clusterState.getRoutingNodes().node(nodeId).numberOfShardsWithState(ShardRoutingState.RELOCATING) > 0 ``` would yield NPE for dedicated non-data nodes. (cherry picked from commit 677766d) Signed-off-by: Andrei Dan <andrei.dan@elastic.co>
diff --git a/docs/changelog/92643.yaml b/docs/changelog/92643.yaml
@@ -0,0 +1,5 @@
+pr: 92643
+summary: Fix NPE when evaluating the disk health for non-data nodes
+area: Health
+type: bug
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/health/node/LocalHealthMonitor.java b/server/src/main/java/org/elasticsearch/health/node/LocalHealthMonitor.java
@@ -21,6 +21,7 @@
 import org.elasticsearch.cluster.ClusterStateListener;
 import org.elasticsearch.cluster.DiskUsage;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.routing.RoutingNode;
 import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.settings.ClusterSettings;
@@ -433,9 +434,18 @@ DiskHealthInfo getHealth(HealthMetadata healthMetadata, ClusterState clusterStat
             }
 
             long highThreshold = diskMetadata.getFreeBytesHighWatermark(totalBytes).getBytes();
-            if (usage.getFreeBytes() < highThreshold && hasRelocatingShards(clusterState, node.getId()) == false) {
-                logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
-                return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
+            if (usage.getFreeBytes() < highThreshold) {
+                if (node.canContainData()) {
+                    // for data nodes only report YELLOW if shards can't move away from the node
+                    if (DiskCheck.hasRelocatingShards(clusterState, node) == false) {
+                        logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
+                        return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
+                    }
+                } else {
+                    // for non-data nodes report YELLOW when the disk high watermark is breached
+                    logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
+                    return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
+                }
             }
             return new DiskHealthInfo(HealthStatus.GREEN);
         }
@@ -461,8 +471,13 @@ private DiskUsage getDiskUsage() {
             return DiskUsage.findLeastAvailablePath(nodeStats);
         }
 
-        private boolean hasRelocatingShards(ClusterState clusterState, String nodeId) {
-            return clusterState.getRoutingNodes().node(nodeId).shardsWithState(ShardRoutingState.RELOCATING).isEmpty() == false;
+        static boolean hasRelocatingShards(ClusterState clusterState, DiscoveryNode node) {
+            RoutingNode routingNode = clusterState.getRoutingNodes().node(node.getId());
+            if (routingNode == null) {
+                // routing node will be null for non-data nodes
+                return false;
+            }
+            return routingNode.numberOfShardsWithState(ShardRoutingState.RELOCATING) > 0;
         }
     }
 }
diff --git a/server/src/test/java/org/elasticsearch/health/node/LocalHealthMonitorTests.java b/server/src/test/java/org/elasticsearch/health/node/LocalHealthMonitorTests.java
@@ -20,6 +20,7 @@
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Settings;
@@ -43,7 +44,9 @@
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
 
+import static org.elasticsearch.action.support.replication.ClusterStateCreationUtils.state;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.nullValue;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.eq;
@@ -306,6 +309,49 @@ public void testFrozenRedDiskStatus() {
         assertThat(diskHealth, equalTo(new DiskHealthInfo(HealthStatus.RED, DiskHealthInfo.Cause.FROZEN_NODE_OVER_FLOOD_STAGE_THRESHOLD)));
     }
 
+    public void testYellowStatusForNonDataNode() {
+        DiscoveryNode dedicatedMasterNode = new DiscoveryNode(
+            "master-node",
+            "master-node-1",
+            ESTestCase.buildNewFakeTransportAddress(),
+            Collections.emptyMap(),
+            Set.of(DiscoveryNodeRole.MASTER_ROLE),
+            Version.CURRENT
+        );
+        clusterState = ClusterStateCreationUtils.state(
+            dedicatedMasterNode,
+            dedicatedMasterNode,
+            node,
+            new DiscoveryNode[] { node, dedicatedMasterNode }
+        ).copyAndUpdate(b -> b.putCustom(HealthMetadata.TYPE, healthMetadata));
+
+        initializeIncreasedDiskSpaceUsage();
+        LocalHealthMonitor.DiskCheck diskMonitor = new LocalHealthMonitor.DiskCheck(nodeService);
+        DiskHealthInfo diskHealth = diskMonitor.getHealth(healthMetadata, clusterState);
+        assertThat(diskHealth, equalTo(new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD)));
+    }
+
+    public void testHasRelocatingShards() {
+        String indexName = "my-index";
+        final ClusterState state = state(indexName, true, ShardRoutingState.RELOCATING);
+        // local node coincides with the node hosting the (relocating) primary shard
+        DiscoveryNode localNode = state.nodes().getLocalNode();
+        assertThat(LocalHealthMonitor.DiskCheck.hasRelocatingShards(state, localNode), is(true));
+
+        DiscoveryNode dedicatedMasterNode = new DiscoveryNode(
+            "master-node",
+            "master-node-1",
+            ESTestCase.buildNewFakeTransportAddress(),
+            Collections.emptyMap(),
+            Set.of(DiscoveryNodeRole.MASTER_ROLE),
+            Version.CURRENT
+        );
+        ClusterState newState = ClusterState.builder(state)
+            .nodes(new DiscoveryNodes.Builder(state.nodes()).add(dedicatedMasterNode))
+            .build();
+        assertThat(LocalHealthMonitor.DiskCheck.hasRelocatingShards(newState, dedicatedMasterNode), is(false));
+    }
+
     private void simulateDiskOutOfSpace() {
         when(
             nodeService.stats(