-
Notifications
You must be signed in to change notification settings - Fork 25.5k
Replace pre publication failed to commit cluster state exceptions #135706
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
0a0418d
f2c836d
350da55
eec4c8f
812ce64
8df45c6
01ffa86
4911606
b76a1e5
7376b9d
4152c42
0eb27a8
f886182
a277c2e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
import org.elasticsearch.cluster.ClusterStatePublicationEvent; | ||
import org.elasticsearch.cluster.ClusterStateUpdateTask; | ||
import org.elasticsearch.cluster.LocalMasterServiceTask; | ||
import org.elasticsearch.cluster.NotMasterException; | ||
import org.elasticsearch.cluster.block.ClusterBlocks; | ||
import org.elasticsearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState; | ||
import org.elasticsearch.cluster.coordination.CoordinationMetadata.VotingConfigExclusion; | ||
|
@@ -1552,7 +1553,7 @@ public void publish( | |
clusterStatePublicationEvent.getNewState().term() | ||
) | ||
); | ||
throw new FailedToCommitClusterStateException( | ||
throw new NotMasterException( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed as part of #135548 and will disappear when I rebase |
||
"node is no longer master for term " | ||
+ clusterStatePublicationEvent.getNewState().term() | ||
+ " while handling publication" | ||
|
@@ -1567,7 +1568,7 @@ public void publish( | |
clusterStatePublicationEvent.getSummary() | ||
) | ||
); | ||
throw new FailedToCommitClusterStateException("publication " + currentPublication.get() + " already in progress"); | ||
throw new FailedToPublishClusterStateException("publication " + currentPublication.get() + " already in progress"); | ||
} | ||
|
||
assert assertPreviousStateConsistency(clusterStatePublicationEvent); | ||
|
@@ -1586,7 +1587,7 @@ assert getLocalNode().equals(clusterState.getNodes().get(getLocalNode().getId()) | |
} catch (Exception e) { | ||
logger.debug(() -> "[" + clusterStatePublicationEvent.getSummary() + "] publishing failed during context creation", e); | ||
becomeCandidate("publication context creation"); | ||
throw new FailedToCommitClusterStateException("publishing failed during context creation", e); | ||
throw new FailedToPublishClusterStateException("publishing failed during context creation", e); | ||
} | ||
|
||
try (Releasable ignored = publicationContext::decRef) { | ||
|
@@ -1607,7 +1608,7 @@ assert getLocalNode().equals(clusterState.getNodes().get(getLocalNode().getId()) | |
e | ||
); | ||
becomeCandidate("publication creation"); | ||
throw new FailedToCommitClusterStateException("publishing failed while starting", e); | ||
throw new FailedToPublishClusterStateException("publishing failed while starting", e); | ||
} | ||
|
||
try { | ||
|
@@ -1638,12 +1639,12 @@ assert getLocalNode().equals(clusterState.getNodes().get(getLocalNode().getId()) | |
} | ||
} | ||
} | ||
} catch (FailedToCommitClusterStateException failedToCommitClusterStateException) { | ||
publishListener.onFailure(failedToCommitClusterStateException); | ||
} catch (FailedToPublishClusterStateException | FailedToCommitClusterStateException | NotMasterException e) { | ||
publishListener.onFailure(e); | ||
} catch (Exception e) { | ||
assert false : e; // all exceptions should already be caught and wrapped in a FailedToCommitClusterStateException | ||
assert false : e; // all exceptions should already be caught and wrapped in a FailedToPublishClusterStateException | | ||
logger.error(() -> "[" + clusterStatePublicationEvent.getSummary() + "] publishing unexpectedly failed", e); | ||
publishListener.onFailure(new FailedToCommitClusterStateException("publishing unexpectedly failed", e)); | ||
publishListener.onFailure(new FailedToPublishClusterStateException("publishing unexpectedly failed", e)); | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -9,15 +9,23 @@ | |||||
package org.elasticsearch.cluster.coordination; | ||||||
|
||||||
import org.elasticsearch.ElasticsearchException; | ||||||
import org.elasticsearch.action.support.master.TransportMasterNodeAction; | ||||||
import org.elasticsearch.common.io.stream.StreamInput; | ||||||
|
||||||
import java.io.IOException; | ||||||
|
||||||
/** | ||||||
* Thrown when a cluster state publication fails to commit the new cluster state. If publication fails then a new master is elected but the | ||||||
* update might or might not take effect, depending on whether or not the newly-elected master accepted the published state that failed to | ||||||
* be committed. | ||||||
* | ||||||
* Exception indicating a cluster state update was published but not committed to all nodes. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Conceptual nit: "committed" is a global property rather than something that happens on one or more nodes.
Suggested change
This exception indicates the publishing master doesn't think the update was committed, but it cannot tell for sure. It depends on which other master nodes accepted it and the winner of the next election. |
||||||
* <p> | ||||||
* If this exception is thrown, then the cluster state update was published, but is not guaranteed | ||||||
* to be committed on any nodes, including the next master node. This exception should only be thrown when there is | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
* <i>ambiguity</i> whether a cluster state update has been committed. | ||||||
* <p> | ||||||
* For exceptions thrown prior to publication, | ||||||
* when the cluster update has <i>definitely</i> failed, use a {@link FailedToPublishClusterStateException}. | ||||||
* <p> | ||||||
* This is a retryable exception inside {@link TransportMasterNodeAction}. | ||||||
* <p> | ||||||
* See {@link ClusterStatePublisher} for more details. | ||||||
*/ | ||||||
public class FailedToCommitClusterStateException extends ElasticsearchException { | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the "Elastic License | ||
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
* Public License v 1"; you may not use this file except in compliance with, at | ||
* your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
* License v3.0 only", or the "Server Side Public License, v 1". | ||
*/ | ||
package org.elasticsearch.cluster.coordination; | ||
|
||
import org.elasticsearch.ElasticsearchException; | ||
import org.elasticsearch.action.support.master.TransportMasterNodeAction; | ||
import org.elasticsearch.common.io.stream.StreamInput; | ||
|
||
import java.io.IOException; | ||
|
||
/** | ||
* Exception indicating a cluster state update failed prior to publication. | ||
* <p> | ||
* If this exception is thrown, then the cluster state update was <i>not</i> published to any node. | ||
* It is therefore impossible for the new master to have committed this state. | ||
* <p> | ||
* For exceptions thrown <i>after</i> publication, when the cluster state update may or may not have been committed, | ||
* use a {@link FailedToCommitClusterStateException}. | ||
* <p> | ||
* This is a retryable exception inside {@link TransportMasterNodeAction}. | ||
* <p>
* Instances never carry a stack trace: {@link #fillInStackTrace()} is overridden to return immediately.
*/ | ||
public class FailedToPublishClusterStateException extends ElasticsearchException { | ||
|
||
|
||
/**
* Creates the exception with the given message.
*
* @param msg the detail message, passed straight to the superclass constructor
*/
public FailedToPublishClusterStateException(String msg) { | ||
super(msg); | ||
} | ||
|
||
/**
* Creates the exception by handing the given stream to the superclass stream constructor.
*
* @param in the stream passed to the superclass constructor
* @throws IOException declared because the superclass stream constructor may throw it
*/
// NOTE(review): presumably the wire-deserialization constructor - confirm it is registered wherever
// ElasticsearchException subclasses are mapped to their stream readers.
public FailedToPublishClusterStateException(StreamInput in) throws IOException { | ||
super(in); | ||
} | ||
|
||
/**
* Creates the exception with a message and an underlying cause.
*
* @param msg the detail message, passed straight to the superclass constructor
* @param cause the failure that prevented publication
* @param args forwarded unchanged to the superclass constructor; presumably message-format arguments - TODO confirm
*/
public FailedToPublishClusterStateException(String msg, Throwable cause, Object... args) { | ||
super(msg, cause, args); | ||
} | ||
|
||
/**
* Returns this instance without calling the superclass implementation, so no stack trace is recorded
* when the exception is constructed.
*
* @return this exception, unchanged
*/
@Override | ||
public Throwable fillInStackTrace() { | ||
return this; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ | |
import org.elasticsearch.cluster.NotMasterException; | ||
import org.elasticsearch.cluster.coordination.ClusterStatePublisher; | ||
import org.elasticsearch.cluster.coordination.FailedToCommitClusterStateException; | ||
import org.elasticsearch.cluster.coordination.FailedToPublishClusterStateException; | ||
import org.elasticsearch.cluster.metadata.ProcessClusterEventTimeoutException; | ||
import org.elasticsearch.cluster.metadata.ProjectMetadata; | ||
import org.elasticsearch.cluster.node.DiscoveryNode; | ||
|
@@ -415,13 +416,37 @@ public void onResponse(Void unused) { | |
|
||
@Override | ||
public void onFailure(Exception exception) { | ||
if (exception instanceof FailedToCommitClusterStateException failedToCommitClusterStateException) { | ||
if (exception instanceof FailedToPublishClusterStateException | ||
|| exception instanceof FailedToCommitClusterStateException | ||
|| exception instanceof NotMasterException) { | ||
final long notificationStartTime = threadPool.rawRelativeTimeInMillis(); | ||
final long version = newClusterState.version(); | ||
logger.warn(() -> format("failing [%s]: failed to commit cluster state version [%s]", summary, version), exception); | ||
|
||
if (exception instanceof FailedToCommitClusterStateException) { | ||
logger.warn( | ||
() -> format("failing [%s]: failed to commit cluster state version [%s]", summary, version), | ||
exception | ||
); | ||
} else if (exception instanceof FailedToPublishClusterStateException) { | ||
logger.warn( | ||
() -> format("failing [%s]: failed to publish cluster state version [%s]", summary, version), | ||
exception | ||
); | ||
} else { | ||
logger.debug( | ||
() -> format( | ||
"node is no longer the master prior to publication of cluster state version [%s]: [%s]", | ||
version, | ||
summary | ||
), | ||
exception | ||
); | ||
} | ||
|
||
for (final var executionResult : executionResults) { | ||
executionResult.onPublishFailure(failedToCommitClusterStateException); | ||
executionResult.onPublishFailure(exception); | ||
} | ||
|
||
final long notificationMillis = threadPool.rawRelativeTimeInMillis() - notificationStartTime; | ||
clusterStateUpdateStatsTracker.onPublicationFailure( | ||
threadPool.rawRelativeTimeInMillis(), | ||
|
@@ -985,11 +1010,17 @@ void onClusterStateUnchanged(ClusterState clusterState) { | |
} | ||
} | ||
|
||
void onPublishFailure(FailedToCommitClusterStateException e) { | ||
void onPublishFailure(Exception e) { | ||
if (publishedStateConsumer == null && onPublicationSuccess == null) { | ||
assert failure != null; | ||
var taskFailure = failure; | ||
failure = new FailedToCommitClusterStateException(e.getMessage(), e); | ||
|
||
if (e instanceof FailedToCommitClusterStateException) { | ||
failure = new FailedToCommitClusterStateException(e.getMessage(), e); | ||
} else { | ||
failure = new NotMasterException(e.getMessage(), e); | ||
} | ||
Comment on lines
1011
to
1015
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also be handling `FailedToPublishClusterStateException` here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes! Good catch - I realised I'm missing code here, and in a few other places too |
||
|
||
failure.addSuppressed(taskFailure); | ||
notifyFailure(); | ||
return; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
roles_security_stats,9176000 | ||
failed_to_publish_cluster_state_exception,9183000 |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed as part of #135548 and will disappear once I rebase