Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ public class ClusterFormationFailureHelper {
private final Runnable logLastFailedJoinAttempt;
@Nullable // if no warning is scheduled
private volatile WarningScheduler warningScheduler;
private volatile boolean loggingEnabled;

/**
* Works with the {@link JoinHelper} to log the latest node-join attempt failure and cluster state debug information. Must call
Expand All @@ -90,6 +91,11 @@ public ClusterFormationFailureHelper(
this.clusterCoordinationExecutor = threadPool.executor(Names.CLUSTER_COORDINATION);
this.clusterFormationWarningTimeout = DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING.get(settings);
this.logLastFailedJoinAttempt = logLastFailedJoinAttempt;
this.loggingEnabled = true;
}

public void setLoggingEnabled(boolean enabled) {
this.loggingEnabled = enabled;
Comment on lines +97 to +98
Copy link
Contributor Author

@JeremyDahlgren JeremyDahlgren Mar 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially it seemed that a

    synchronized(mutex) {
        clusterFormationFailureHelper.stop();
    }

block in Coordinator::doStop() would be sufficient, but besides needing to hold the lock when the Node is being shut down, it seems a race condition could exist where the failure helper is stopped on shutdown but another thread immediately calls becomeCandidate() and starts it back up?

Also if calls to ClusterFormationFailureHelper start() and stop() were not synchronized it looks like there is the potential for a NullPointerException in start() if the warningScheduler is set to null between the assignment of a new WarningScheduler instance and the next line that calls scheduleNextWarning (Intellij warns about this):

    public void start() {
        assert warningScheduler == null;
        warningScheduler = new WarningScheduler();
        warningScheduler.scheduleNextWarning();
    }

    public void stop() {
        warningScheduler = null;
    }

}

public boolean isRunning() {
Expand All @@ -98,7 +104,7 @@ public boolean isRunning() {

/**
* Schedules a warning debug message to be logged in 'clusterFormationWarningTimeout' time, and periodically thereafter, until
* {@link ClusterFormationState#stop()} has been called.
* {@link ClusterFormationFailureHelper#stop()} has been called.
*/
public void start() {
assert warningScheduler == null;
Expand All @@ -125,7 +131,7 @@ public void onFailure(Exception e) {

@Override
protected void doRun() {
if (isActive()) {
if (isActive() && loggingEnabled) {
logLastFailedJoinAttempt.run();
logger.warn(
"{}; for troubleshooting guidance, see {}",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,7 @@ protected void doStart() {
applierState = initialState;
clusterApplier.setInitialState(initialState);
}
clusterFormationFailureHelper.setLoggingEnabled(true);
}

public DiscoveryStats stats() {
Expand All @@ -1126,6 +1127,7 @@ public void startInitialJoin() {
protected void doStop() {
configuredHostsResolver.stop();
joinValidationService.stop();
clusterFormationFailureHelper.setLoggingEnabled(false);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,45 @@ public void testScheduling() {

assertThat(warningCount.get(), is(5L));
assertThat(logLastFailedJoinAttemptWarningCount.get(), is(5L));

// Temporarily disable logging and verify we don't get incremented logging counts.
clusterFormationFailureHelper.setLoggingEnabled(false);
warningCount.set(0);
logLastFailedJoinAttemptWarningCount.set(0);
clusterFormationFailureHelper.start();
clusterFormationFailureHelper.stop();
clusterFormationFailureHelper.start();
final long thirdStartTimeMillis = deterministicTaskQueue.getCurrentTimeMillis();

while (deterministicTaskQueue.getCurrentTimeMillis() - thirdStartTimeMillis < 5 * expectedDelayMillis) {
assertTrue(clusterFormationFailureHelper.isRunning());
if (deterministicTaskQueue.hasRunnableTasks()) {
deterministicTaskQueue.runRandomTask();
} else {
deterministicTaskQueue.advanceTime();
}
}

assertThat(warningCount.get(), is(0L));
assertThat(logLastFailedJoinAttemptWarningCount.get(), is(0L));

// Re-enable logging and verify the logging counts again.
clusterFormationFailureHelper.stop();
clusterFormationFailureHelper.start();
clusterFormationFailureHelper.setLoggingEnabled(true);
final long fourthStartTimeMillis = deterministicTaskQueue.getCurrentTimeMillis();

while (warningCount.get() < 5) {
assertTrue(clusterFormationFailureHelper.isRunning());
if (deterministicTaskQueue.hasRunnableTasks()) {
deterministicTaskQueue.runRandomTask();
} else {
deterministicTaskQueue.advanceTime();
}
}
assertThat(deterministicTaskQueue.getCurrentTimeMillis() - fourthStartTimeMillis, equalTo(5 * expectedDelayMillis));
assertThat(warningCount.get(), is(5L));
assertThat(logLastFailedJoinAttemptWarningCount.get(), is(5L));
}

public void testDescriptionOnMasterIneligibleNodes() {
Expand Down
Loading