Commit 82ac850

Fixing the conditions for fetching remote master history (#89472) (#90040)
Fixing the conditions that the health API uses to determine when to check with a master node for its view of master history if the master appears to have gone null repeatedly.
1 parent 70585a8 commit 82ac850

File tree

3 files changed: +19 −9 lines

docs/changelog/89472.yaml

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+pr: 89472
+summary: Fixing the conditions for fetching remote master history
+area: Health
+type: bug
+issues:
+ - 89431

server/src/internalClusterTest/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java

Lines changed: 4 additions & 8 deletions

@@ -426,28 +426,24 @@ public void testRepeatedNullMasterRecognizedAsGreenIfMasterDoesNotKnowItIsUnstab
                 .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
                 .build()
         );
+        int nullTransitionsThreshold = 1;
         final List<String> dataNodes = internalCluster().startDataOnlyNodes(
             2,
             Settings.builder()
                 .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
                 .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
-                .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
+                .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), nullTransitionsThreshold)
                 .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(60, TimeUnit.SECONDS))
                 .build()
         );
         ensureStableCluster(3);
-        for (int i = 0; i < 2; i++) {
+        for (int i = 0; i < nullTransitionsThreshold + 1; i++) {
             final String masterNode = masterNodes.get(0);

             // Simulating a painful gc by suspending all threads for a long time on the current elected master node.
             SingleNodeDisruption masterNodeDisruption = new LongGCDisruption(random(), masterNode);

             final CountDownLatch dataNodeMasterSteppedDown = new CountDownLatch(2);
-            internalCluster().getInstance(ClusterService.class, masterNode).addListener(event -> {
-                if (event.state().nodes().getMasterNodeId() == null) {
-                    dataNodeMasterSteppedDown.countDown();
-                }
-            });
             internalCluster().getInstance(ClusterService.class, dataNodes.get(0)).addListener(event -> {
                 if (event.state().nodes().getMasterNodeId() == null) {
                     dataNodeMasterSteppedDown.countDown();

@@ -466,7 +462,7 @@ public void testRepeatedNullMasterRecognizedAsGreenIfMasterDoesNotKnowItIsUnstab
             // Stop disruption
             logger.info("--> unfreezing node [{}]", masterNode);
             masterNodeDisruption.stopDisrupting();
-            ensureStableCluster(3);
+            ensureStableCluster(3, TimeValue.timeValueSeconds(30), false, randomFrom(dataNodes));
         }
         assertGreenMasterStability(internalCluster().client(randomFrom(dataNodes)));
     }
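
Note that the test synchronizes on a CountDownLatch that each watching node decrements when it observes the master go null, proceeding only once all counts arrive; with the master-node listener removed, the counts come from data-node listeners only. A minimal plain-Java illustration of that coordination pattern (the class, thread bodies, and node ids here are hypothetical stand-ins; the real test registers ClusterStateListeners via internalCluster()):

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

public class LatchPatternSketch {
    public static void main(String[] args) throws InterruptedException {
        // One count per observer that must see the master step down
        // (two data nodes in the test above).
        CountDownLatch masterSteppedDown = new CountDownLatch(2);

        // Stand-ins for the data-node cluster-state listeners.
        for (int node = 0; node < 2; node++) {
            final int id = node;
            new Thread(() -> {
                // ...observe cluster state; when the master id becomes null:
                System.out.println("data node " + id + " saw master go null");
                masterSteppedDown.countDown();
            }).start();
        }

        // Proceed only once every observer has seen the step-down, with a timeout.
        if (masterSteppedDown.await(30, TimeUnit.SECONDS) == false) {
            throw new AssertionError("master did not step down in time");
        }
        System.out.println("all data nodes observed the step-down");
    }
}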

server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java

Lines changed: 9 additions & 1 deletion

@@ -442,8 +442,16 @@ private boolean hasSeenMasterInHasMasterLookupTimeframe() {
     public void clusterChanged(ClusterChangedEvent event) {
         DiscoveryNode currentMaster = event.state().nodes().getMasterNode();
         DiscoveryNode previousMaster = event.previousState().nodes().getMasterNode();
-        if (currentMaster == null && previousMaster != null) {
+        if ((currentMaster == null && previousMaster != null) || (currentMaster != null && previousMaster == null)) {
             if (masterHistoryService.getLocalMasterHistory().hasMasterGoneNullAtLeastNTimes(unacceptableNullTransitions)) {
+                /*
+                 * If the master node has been going to null repeatedly, we want to make a remote request to it to see what it thinks of
+                 * master stability. We want to query the most recent master whether the current master has just transitioned to null or
+                 * just transitioned from null to not null. The reason that we make the latter request is that sometimes when the elected
+                 * master goes to null the most recent master is not responsive for the duration of the request timeout (for example if
+                 * that node is in the middle of a long GC pause which would be both the reason for it not being master and the reason it
+                 * does not respond quickly to transport requests).
+                 */
                 DiscoveryNode master = masterHistoryService.getLocalMasterHistory().getMostRecentNonNullMaster();
                 /*
                  * If the most recent master was this box, there is no point in making a transport request -- we already know what this
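
To make the broadened trigger condition concrete, here is a minimal, self-contained sketch of the transition logic (all names are simplified, hypothetical stand-ins: the real CoordinationDiagnosticsService consults MasterHistoryService and a time-bounded null-transition count rather than a bare counter, and issues a transport request instead of printing):

public final class MasterTransitionSketch {
    // Stand-in for the unacceptableNullTransitions threshold in the patch.
    private static final int UNACCEPTABLE_NULL_TRANSITIONS = 1;

    private int nullTransitions = 0;        // how many times the master has gone null
    private String mostRecentNonNullMaster; // last node observed as elected master

    // Called with the master node id before and after a cluster state change;
    // null means "no elected master".
    public void onClusterChanged(String previousMaster, String currentMaster) {
        boolean wentNull = currentMaster == null && previousMaster != null;
        boolean cameBack = currentMaster != null && previousMaster == null;
        if (currentMaster != null) {
            mostRecentNonNullMaster = currentMaster;
        }
        if (wentNull) {
            nullTransitions++;
        }
        // Before the fix, only `wentNull` triggered the remote check; the fix
        // also triggers on `cameBack`, since the departing master is often
        // unresponsive (e.g. mid-GC) at the moment it goes null.
        if ((wentNull || cameBack) && nullTransitions >= UNACCEPTABLE_NULL_TRANSITIONS) {
            System.out.println("would fetch remote master history from " + mostRecentNonNullMaster);
        }
    }

    public static void main(String[] args) {
        MasterTransitionSketch sketch = new MasterTransitionSketch();
        sketch.onClusterChanged(null, "node-1"); // initial election: no null transitions yet, nothing fires
        sketch.onClusterChanged("node-1", null); // master lost: first null transition, remote check fires
        sketch.onClusterChanged(null, "node-2"); // re-election: with this fix, the check fires here too
    }
}

The last call in main is the case this commit adds: querying on the null-to-non-null transition gives the health API a second chance to reach a master that was unreachable while it was stepping down.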
