Skip to content

Commit ba5a9c8

Browse files
Report outcome of data node's shard snapshot master update (#123652)
Improves the information in the IndexShardSnapshotStatus's statusDescription field to include the success/failure of the remote call to the master node to update the shard snapshot state. This allows us to see if there is a discrepancy between the state of the data node and the master node. Closes ES-10991
1 parent 6bdb9fe commit ba5a9c8

File tree

2 files changed

+65
-24
lines changed

2 files changed

+65
-24
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,7 @@ public void testSnapshotShutdownProgressTracker() throws Exception {
597597
"SnapshotShutdownProgressTracker index shard snapshot messages",
598598
SnapshotShutdownProgressTracker.class.getCanonicalName(),
599599
Level.INFO,
600-
"statusDescription='finished: master notification attempt complete'"
600+
"*successfully sent shard snapshot state [PAUSED_FOR_NODE_REMOVAL] update to the master node*"
601601
)
602602
);
603603

server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java

Lines changed: 64 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
import java.util.Iterator;
6262
import java.util.List;
6363
import java.util.Map;
64-
import java.util.function.Supplier;
64+
import java.util.function.Consumer;
6565

6666
import static java.util.Collections.emptyMap;
6767
import static org.elasticsearch.core.Strings.format;
@@ -340,7 +340,8 @@ private void handleUpdatedSnapshotsInProgressEntry(String localNodeId, boolean r
340340
ShardState.FAILED,
341341
shard.getValue().reason(),
342342
shard.getValue().generation(),
343-
() -> null
343+
// Shard snapshot never began, so there is no status object to update.
344+
(outcomeInfoString) -> {}
344345
);
345346
}
346347
} else {
@@ -403,7 +404,6 @@ private void pauseShardSnapshotsForNodeRemoval(String localNodeId, SnapshotsInPr
403404
for (final Map.Entry<ShardId, ShardSnapshotStatus> shardEntry : entry.shards().entrySet()) {
404405
final ShardId shardId = shardEntry.getKey();
405406
final ShardSnapshotStatus masterShardSnapshotStatus = shardEntry.getValue();
406-
IndexShardSnapshotStatus indexShardSnapshotStatus = localShardSnapshots.get(shardId);
407407

408408
if (masterShardSnapshotStatus.state() != ShardState.INIT) {
409409
// shard snapshot not currently scheduled by master
@@ -424,10 +424,8 @@ private void pauseShardSnapshotsForNodeRemoval(String localNodeId, SnapshotsInPr
424424
ShardState.PAUSED_FOR_NODE_REMOVAL,
425425
"paused",
426426
masterShardSnapshotStatus.generation(),
427-
() -> {
428-
indexShardSnapshotStatus.updateStatusDescription("finished: master notification attempt complete");
429-
return null;
430-
}
427+
// Shard snapshot never began, so there is no status object to update
428+
(outcomeInfoString) -> {}
431429
);
432430
} else {
433431
// shard snapshot currently running, mark for pause
@@ -444,9 +442,8 @@ private Runnable newShardSnapshotTask(
444442
final IndexVersion entryVersion,
445443
final long entryStartTime
446444
) {
447-
Supplier<Void> postMasterNotificationAction = () -> {
448-
snapshotStatus.updateStatusDescription("finished: master notification attempt complete");
449-
return null;
445+
Consumer<String> postMasterNotificationAction = (outcomeInfoString) -> {
446+
snapshotStatus.updateStatusDescription("Data node shard snapshot finished. Remote master update outcome: " + outcomeInfoString);
450447
};
451448

452449
// Listener that runs on completion of the shard snapshot: it will notify the master node of success or failure.
@@ -687,6 +684,8 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
687684
if (masterShard != null && masterShard.state().completed() == false) {
688685
final IndexShardSnapshotStatus.Copy indexShardSnapshotStatus = localShard.getValue().asCopy();
689686
final Stage stage = indexShardSnapshotStatus.getStage();
687+
final String statusDescription = indexShardSnapshotStatus.getStatusDescription();
688+
final int maxStatusAppend = 1000;
690689
// Master knows about the shard and thinks it has not completed
691690
if (stage == Stage.DONE) {
692691
// but we think the shard is done - we need to make new master know that the shard is done
@@ -700,7 +699,20 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
700699
snapshot.snapshot(),
701700
shardId,
702701
localShard.getValue().getShardSnapshotResult(),
703-
() -> null
702+
(outcomeInfoString) -> localShard.getValue()
703+
.updateStatusDescription(
704+
Strings.format(
705+
"""
706+
Data node already successfully finished shard snapshot, but a new master needed to be
707+
notified. New remote master notification outcome: [%s]. The prior shard snapshot status
708+
description was [%s]
709+
""",
710+
outcomeInfoString,
711+
statusDescription.length() < maxStatusAppend
712+
? statusDescription
713+
: statusDescription.substring(0, maxStatusAppend)
714+
)
715+
)
704716
);
705717
} else if (stage == Stage.FAILURE) {
706718
// but we think the shard failed - we need to make new master know that the shard failed
@@ -716,7 +728,21 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
716728
ShardState.FAILED,
717729
indexShardSnapshotStatus.getFailure(),
718730
localShard.getValue().generation(),
719-
() -> null
731+
// Update the original statusDescription with the latest remote master call outcome, but include the old
732+
// response. This will allow us to see when/whether the information reached the previous and current master.
733+
(outcomeInfoString) -> localShard.getValue()
734+
.updateStatusDescription(
735+
Strings.format(
736+
"""
737+
Data node already failed shard snapshot, but a new master needed to be notified. New remote
738+
master notification outcome: [%s]. The prior shard snapshot status description was [%s]
739+
""",
740+
outcomeInfoString,
741+
statusDescription.length() < maxStatusAppend
742+
? statusDescription
743+
: statusDescription.substring(0, maxStatusAppend)
744+
)
745+
)
720746
);
721747
} else if (stage == Stage.PAUSED) {
722748
// but we think the shard has paused - we need to make new master know that
@@ -730,7 +756,19 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
730756
ShardState.PAUSED_FOR_NODE_REMOVAL,
731757
indexShardSnapshotStatus.getFailure(),
732758
localShard.getValue().generation(),
733-
() -> null
759+
(outcomeInfoString) -> localShard.getValue()
760+
.updateStatusDescription(
761+
Strings.format(
762+
"""
763+
Data node already paused shard snapshot, but a new master needed to be notified. New remote
764+
master notification outcome: [%s]. The prior shard snapshot status description was [%s]
765+
""",
766+
outcomeInfoString,
767+
statusDescription.length() < maxStatusAppend
768+
? statusDescription
769+
: statusDescription.substring(0, maxStatusAppend)
770+
)
771+
)
734772
);
735773
}
736774
}
@@ -747,7 +785,7 @@ private void notifySuccessfulSnapshotShard(
747785
final Snapshot snapshot,
748786
final ShardId shardId,
749787
ShardSnapshotResult shardSnapshotResult,
750-
Supplier<Void> postMasterNotificationAction
788+
Consumer<String> postMasterNotificationAction
751789
) {
752790
assert shardSnapshotResult != null;
753791
assert shardSnapshotResult.getGeneration() != null;
@@ -768,7 +806,7 @@ private void notifyUnsuccessfulSnapshotShard(
768806
final ShardState shardState,
769807
final String failure,
770808
final ShardGeneration generation,
771-
Supplier<Void> postMasterNotificationAction
809+
Consumer<String> postMasterNotificationAction
772810
) {
773811
assert shardState == ShardState.FAILED || shardState == ShardState.PAUSED_FOR_NODE_REMOVAL : shardState;
774812
sendSnapshotShardUpdate(
@@ -792,29 +830,32 @@ private void sendSnapshotShardUpdate(
792830
final Snapshot snapshot,
793831
final ShardId shardId,
794832
final ShardSnapshotStatus status,
795-
Supplier<Void> postMasterNotificationAction
833+
Consumer<String> postMasterNotificationAction
796834
) {
835+
snapshotShutdownProgressTracker.trackRequestSentToMaster(snapshot, shardId);
797836
ActionListener<Void> updateResultListener = new ActionListener<>() {
798837
@Override
799838
public void onResponse(Void aVoid) {
839+
snapshotShutdownProgressTracker.releaseRequestSentToMaster(snapshot, shardId);
840+
postMasterNotificationAction.accept(
841+
Strings.format("successfully sent shard snapshot state [%s] update to the master node", status.state())
842+
);
800843
logger.trace("[{}][{}] updated snapshot state to [{}]", shardId, snapshot, status);
801844
}
802845

803846
@Override
804847
public void onFailure(Exception e) {
848+
snapshotShutdownProgressTracker.releaseRequestSentToMaster(snapshot, shardId);
849+
postMasterNotificationAction.accept(
850+
Strings.format("exception trying to send shard snapshot state [%s] update to the master node [%s]", status.state(), e)
851+
);
805852
logger.warn(() -> format("[%s][%s] failed to update snapshot state to [%s]", shardId, snapshot, status), e);
806853
}
807854
};
808855

809-
snapshotShutdownProgressTracker.trackRequestSentToMaster(snapshot, shardId);
810-
var releaseTrackerRequestRunsBeforeResultListener = ActionListener.runBefore(updateResultListener, () -> {
811-
snapshotShutdownProgressTracker.releaseRequestSentToMaster(snapshot, shardId);
812-
postMasterNotificationAction.get();
813-
});
814-
815856
remoteFailedRequestDeduplicator.executeOnce(
816857
new UpdateIndexShardSnapshotStatusRequest(snapshot, shardId, status),
817-
releaseTrackerRequestRunsBeforeResultListener,
858+
updateResultListener,
818859
(req, reqListener) -> transportService.sendRequest(
819860
transportService.getLocalNode(),
820861
SnapshotsService.UPDATE_SNAPSHOT_STATUS_ACTION_NAME,

0 commit comments

Comments
 (0)