Skip to content

Commit f482392

Browse files
Report outcome of data node's shard snapshot master update
Closes ES-10991
1 parent 79c388a commit f482392

File tree

1 file changed

+49
-23
lines changed

1 file changed

+49
-23
lines changed

server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
import java.util.Iterator;
6262
import java.util.List;
6363
import java.util.Map;
64-
import java.util.function.Supplier;
64+
import java.util.function.Consumer;
6565

6666
import static java.util.Collections.emptyMap;
6767
import static org.elasticsearch.core.Strings.format;
@@ -332,7 +332,8 @@ private void handleUpdatedSnapshotsInProgressEntry(String localNodeId, boolean r
332332
ShardState.FAILED,
333333
shard.getValue().reason(),
334334
shard.getValue().generation(),
335-
() -> null
335+
// Shard snapshot never began, so there is no status object to update.
336+
(outcomeInfoString) -> {}
336337
);
337338
}
338339
} else {
@@ -395,7 +396,6 @@ private void pauseShardSnapshotsForNodeRemoval(String localNodeId, SnapshotsInPr
395396
for (final Map.Entry<ShardId, ShardSnapshotStatus> shardEntry : entry.shards().entrySet()) {
396397
final ShardId shardId = shardEntry.getKey();
397398
final ShardSnapshotStatus masterShardSnapshotStatus = shardEntry.getValue();
398-
IndexShardSnapshotStatus indexShardSnapshotStatus = localShardSnapshots.get(shardId);
399399

400400
if (masterShardSnapshotStatus.state() != ShardState.INIT) {
401401
// shard snapshot not currently scheduled by master
@@ -416,10 +416,8 @@ private void pauseShardSnapshotsForNodeRemoval(String localNodeId, SnapshotsInPr
416416
ShardState.PAUSED_FOR_NODE_REMOVAL,
417417
"paused",
418418
masterShardSnapshotStatus.generation(),
419-
() -> {
420-
indexShardSnapshotStatus.updateStatusDescription("finished: master notification attempt complete");
421-
return null;
422-
}
419+
// Shard snapshot never began, so there is no status object to update
420+
(outcomeInfoString) -> {}
423421
);
424422
} else {
425423
// shard snapshot currently running, mark for pause
@@ -436,9 +434,8 @@ private Runnable newShardSnapshotTask(
436434
final IndexVersion entryVersion,
437435
final long entryStartTime
438436
) {
439-
Supplier<Void> postMasterNotificationAction = () -> {
440-
snapshotStatus.updateStatusDescription("finished: master notification attempt complete");
441-
return null;
437+
Consumer<String> postMasterNotificationAction = (outcomeInfoString) -> {
438+
snapshotStatus.updateStatusDescription("Data node shard snapshot finished. Remote master update outcome: " + outcomeInfoString);
442439
};
443440

444441
// Listener that runs on completion of the shard snapshot: it will notify the master node of success or failure.
@@ -692,7 +689,15 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
692689
snapshot.snapshot(),
693690
shardId,
694691
localShard.getValue().getShardSnapshotResult(),
695-
() -> null
692+
(outcomeInfoString) -> localShard.getValue().updateStatusDescription(
693+
Strings.format("""
694+
Data node already successfully finished shard snapshot, but a new master needed to be notified. New
695+
remote master notification outcome: [%s]. The prior shard snapshot status description was [%s]
696+
""",
697+
outcomeInfoString,
698+
indexShardSnapshotStatus.getStatusDescription()
699+
)
700+
)
696701
);
697702
} else if (stage == Stage.FAILURE) {
698703
// but we think the shard failed - we need to make new master know that the shard failed
@@ -708,7 +713,17 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
708713
ShardState.FAILED,
709714
indexShardSnapshotStatus.getFailure(),
710715
localShard.getValue().generation(),
711-
() -> null
716+
// Update the original statusDescription with the latest remote master call outcome, but include the old
717+
// response. This will allow us to see when/whether the information reached the previous and current master.
718+
(outcomeInfoString) -> localShard.getValue().updateStatusDescription(
719+
Strings.format("""
720+
Data node already failed shard snapshot, but a new master needed to be notified. New remote master
721+
notification outcome: [%s]. The prior shard snapshot status description was [%s]
722+
""",
723+
outcomeInfoString,
724+
indexShardSnapshotStatus.getStatusDescription()
725+
)
726+
)
712727
);
713728
} else if (stage == Stage.PAUSED) {
714729
// but we think the shard has paused - we need to make new master know that
@@ -722,7 +737,16 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
722737
ShardState.PAUSED_FOR_NODE_REMOVAL,
723738
indexShardSnapshotStatus.getFailure(),
724739
localShard.getValue().generation(),
725-
() -> null
740+
(outcomeInfoString) -> localShard.getValue()
741+
.updateStatusDescription(
742+
Strings.format("""
743+
Data node already paused shard snapshot, but a new master needed to be notified. New remote
744+
master notification outcome: [%s]. The prior shard snapshot status description was [%s]
745+
""",
746+
outcomeInfoString,
747+
indexShardSnapshotStatus.getStatusDescription()
748+
)
749+
)
726750
);
727751
}
728752
}
@@ -739,7 +763,7 @@ private void notifySuccessfulSnapshotShard(
739763
final Snapshot snapshot,
740764
final ShardId shardId,
741765
ShardSnapshotResult shardSnapshotResult,
742-
Supplier<Void> postMasterNotificationAction
766+
Consumer<String> postMasterNotificationAction
743767
) {
744768
assert shardSnapshotResult != null;
745769
assert shardSnapshotResult.getGeneration() != null;
@@ -760,7 +784,7 @@ private void notifyUnsuccessfulSnapshotShard(
760784
final ShardState shardState,
761785
final String failure,
762786
final ShardGeneration generation,
763-
Supplier<Void> postMasterNotificationAction
787+
Consumer<String> postMasterNotificationAction
764788
) {
765789
assert shardState == ShardState.FAILED || shardState == ShardState.PAUSED_FOR_NODE_REMOVAL : shardState;
766790
sendSnapshotShardUpdate(
@@ -784,29 +808,31 @@ private void sendSnapshotShardUpdate(
784808
final Snapshot snapshot,
785809
final ShardId shardId,
786810
final ShardSnapshotStatus status,
787-
Supplier<Void> postMasterNotificationAction
811+
Consumer<String> postMasterNotificationAction
788812
) {
789813
ActionListener<Void> updateResultListener = new ActionListener<>() {
790814
@Override
791815
public void onResponse(Void aVoid) {
816+
snapshotShutdownProgressTracker.releaseRequestSentToMaster(snapshot, shardId);
817+
postMasterNotificationAction.accept(
818+
Strings.format("successfully sent shard snapshot state [%s] update to the master node", status.state())
819+
);
792820
logger.trace("[{}][{}] updated snapshot state to [{}]", shardId, snapshot, status);
793821
}
794822

795823
@Override
796824
public void onFailure(Exception e) {
825+
snapshotShutdownProgressTracker.releaseRequestSentToMaster(snapshot, shardId);
826+
postMasterNotificationAction.accept(
827+
Strings.format("exception trying to send shard snapshot state [%s] update to the master node [%s]", status.state(), e)
828+
);
797829
logger.warn(() -> format("[%s][%s] failed to update snapshot state to [%s]", shardId, snapshot, status), e);
798830
}
799831
};
800832

801-
snapshotShutdownProgressTracker.trackRequestSentToMaster(snapshot, shardId);
802-
var releaseTrackerRequestRunsBeforeResultListener = ActionListener.runBefore(updateResultListener, () -> {
803-
snapshotShutdownProgressTracker.releaseRequestSentToMaster(snapshot, shardId);
804-
postMasterNotificationAction.get();
805-
});
806-
807833
remoteFailedRequestDeduplicator.executeOnce(
808834
new UpdateIndexShardSnapshotStatusRequest(snapshot, shardId, status),
809-
releaseTrackerRequestRunsBeforeResultListener,
835+
updateResultListener,
810836
(req, reqListener) -> transportService.sendRequest(
811837
transportService.getLocalNode(),
812838
SnapshotsService.UPDATE_SNAPSHOT_STATUS_ACTION_NAME,

0 commit comments

Comments
 (0)