Skip to content

Commit 169bb72

Browse files
authored
Fix NPE when evaluating the disk health for non-data nodes (#92643) (#92648)
Non-data nodes are not part of the `RoutingTable`, so `clusterState.getRoutingNodes().node(nodeId)` returns `null` for them and the expression `clusterState.getRoutingNodes().node(nodeId).numberOfShardsWithState(ShardRoutingState.RELOCATING) > 0` would throw a NullPointerException (NPE) for dedicated non-data nodes. (cherry picked from commit 677766d) Signed-off-by: Andrei Dan <[email protected]>
1 parent 1de71c0 commit 169bb72

File tree

3 files changed

+71
-5
lines changed

3 files changed

+71
-5
lines changed

docs/changelog/92643.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 92643
2+
summary: Fix NPE when evaluating the disk health for non-data nodes
3+
area: Health
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/health/node/LocalHealthMonitor.java

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.elasticsearch.cluster.ClusterStateListener;
2222
import org.elasticsearch.cluster.DiskUsage;
2323
import org.elasticsearch.cluster.node.DiscoveryNode;
24+
import org.elasticsearch.cluster.routing.RoutingNode;
2425
import org.elasticsearch.cluster.routing.ShardRoutingState;
2526
import org.elasticsearch.cluster.service.ClusterService;
2627
import org.elasticsearch.common.settings.ClusterSettings;
@@ -433,9 +434,18 @@ DiskHealthInfo getHealth(HealthMetadata healthMetadata, ClusterState clusterStat
433434
}
434435

435436
long highThreshold = diskMetadata.getFreeBytesHighWatermark(totalBytes).getBytes();
436-
if (usage.getFreeBytes() < highThreshold && hasRelocatingShards(clusterState, node.getId()) == false) {
437-
logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
438-
return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
437+
if (usage.getFreeBytes() < highThreshold) {
438+
if (node.canContainData()) {
439+
// for data nodes only report YELLOW if shards can't move away from the node
440+
if (DiskCheck.hasRelocatingShards(clusterState, node) == false) {
441+
logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
442+
return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
443+
}
444+
} else {
445+
// for non-data nodes report YELLOW when the disk high watermark is breached
446+
logger.debug("High disk watermark [{}] exceeded on {}", highThreshold, usage);
447+
return new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD);
448+
}
439449
}
440450
return new DiskHealthInfo(HealthStatus.GREEN);
441451
}
@@ -461,8 +471,13 @@ private DiskUsage getDiskUsage() {
461471
return DiskUsage.findLeastAvailablePath(nodeStats);
462472
}
463473

464-
private boolean hasRelocatingShards(ClusterState clusterState, String nodeId) {
465-
return clusterState.getRoutingNodes().node(nodeId).shardsWithState(ShardRoutingState.RELOCATING).isEmpty() == false;
474+
static boolean hasRelocatingShards(ClusterState clusterState, DiscoveryNode node) {
475+
RoutingNode routingNode = clusterState.getRoutingNodes().node(node.getId());
476+
if (routingNode == null) {
477+
// routing node will be null for non-data nodes
478+
return false;
479+
}
480+
return routingNode.numberOfShardsWithState(ShardRoutingState.RELOCATING) > 0;
466481
}
467482
}
468483
}

server/src/test/java/org/elasticsearch/health/node/LocalHealthMonitorTests.java

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import org.elasticsearch.cluster.node.DiscoveryNode;
2121
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
2222
import org.elasticsearch.cluster.node.DiscoveryNodes;
23+
import org.elasticsearch.cluster.routing.ShardRoutingState;
2324
import org.elasticsearch.cluster.service.ClusterService;
2425
import org.elasticsearch.common.settings.ClusterSettings;
2526
import org.elasticsearch.common.settings.Settings;
@@ -43,7 +44,9 @@
4344
import java.util.concurrent.atomic.AtomicInteger;
4445
import java.util.concurrent.atomic.AtomicReference;
4546

47+
import static org.elasticsearch.action.support.replication.ClusterStateCreationUtils.state;
4648
import static org.hamcrest.Matchers.equalTo;
49+
import static org.hamcrest.Matchers.is;
4750
import static org.hamcrest.Matchers.nullValue;
4851
import static org.mockito.ArgumentMatchers.any;
4952
import static org.mockito.ArgumentMatchers.eq;
@@ -306,6 +309,49 @@ public void testFrozenRedDiskStatus() {
306309
assertThat(diskHealth, equalTo(new DiskHealthInfo(HealthStatus.RED, DiskHealthInfo.Cause.FROZEN_NODE_OVER_FLOOD_STAGE_THRESHOLD)));
307310
}
308311

312+
public void testYellowStatusForNonDataNode() {
313+
DiscoveryNode dedicatedMasterNode = new DiscoveryNode(
314+
"master-node",
315+
"master-node-1",
316+
ESTestCase.buildNewFakeTransportAddress(),
317+
Collections.emptyMap(),
318+
Set.of(DiscoveryNodeRole.MASTER_ROLE),
319+
Version.CURRENT
320+
);
321+
clusterState = ClusterStateCreationUtils.state(
322+
dedicatedMasterNode,
323+
dedicatedMasterNode,
324+
node,
325+
new DiscoveryNode[] { node, dedicatedMasterNode }
326+
).copyAndUpdate(b -> b.putCustom(HealthMetadata.TYPE, healthMetadata));
327+
328+
initializeIncreasedDiskSpaceUsage();
329+
LocalHealthMonitor.DiskCheck diskMonitor = new LocalHealthMonitor.DiskCheck(nodeService);
330+
DiskHealthInfo diskHealth = diskMonitor.getHealth(healthMetadata, clusterState);
331+
assertThat(diskHealth, equalTo(new DiskHealthInfo(HealthStatus.YELLOW, DiskHealthInfo.Cause.NODE_OVER_HIGH_THRESHOLD)));
332+
}
333+
334+
public void testHasRelocatingShards() {
335+
String indexName = "my-index";
336+
final ClusterState state = state(indexName, true, ShardRoutingState.RELOCATING);
337+
// local node coincides with the node hosting the (relocating) primary shard
338+
DiscoveryNode localNode = state.nodes().getLocalNode();
339+
assertThat(LocalHealthMonitor.DiskCheck.hasRelocatingShards(state, localNode), is(true));
340+
341+
DiscoveryNode dedicatedMasterNode = new DiscoveryNode(
342+
"master-node",
343+
"master-node-1",
344+
ESTestCase.buildNewFakeTransportAddress(),
345+
Collections.emptyMap(),
346+
Set.of(DiscoveryNodeRole.MASTER_ROLE),
347+
Version.CURRENT
348+
);
349+
ClusterState newState = ClusterState.builder(state)
350+
.nodes(new DiscoveryNodes.Builder(state.nodes()).add(dedicatedMasterNode))
351+
.build();
352+
assertThat(LocalHealthMonitor.DiskCheck.hasRelocatingShards(newState, dedicatedMasterNode), is(false));
353+
}
354+
309355
private void simulateDiskOutOfSpace() {
310356
when(
311357
nodeService.stats(

0 commit comments

Comments
 (0)