6161import java .util .Iterator ;
6262import java .util .List ;
6363import java .util .Map ;
64- import java .util .function .Supplier ;
64+ import java .util .function .Consumer ;
6565
6666import static java .util .Collections .emptyMap ;
6767import static org .elasticsearch .core .Strings .format ;
@@ -332,7 +332,8 @@ private void handleUpdatedSnapshotsInProgressEntry(String localNodeId, boolean r
332332 ShardState .FAILED ,
333333 shard .getValue ().reason (),
334334 shard .getValue ().generation (),
335- () -> null
335+ // Shard snapshot never began, so there is no status object to update.
336+ (outcomeInfoString ) -> {}
336337 );
337338 }
338339 } else {
@@ -395,7 +396,6 @@ private void pauseShardSnapshotsForNodeRemoval(String localNodeId, SnapshotsInPr
395396 for (final Map .Entry <ShardId , ShardSnapshotStatus > shardEntry : entry .shards ().entrySet ()) {
396397 final ShardId shardId = shardEntry .getKey ();
397398 final ShardSnapshotStatus masterShardSnapshotStatus = shardEntry .getValue ();
398- IndexShardSnapshotStatus indexShardSnapshotStatus = localShardSnapshots .get (shardId );
399399
400400 if (masterShardSnapshotStatus .state () != ShardState .INIT ) {
401401 // shard snapshot not currently scheduled by master
@@ -416,10 +416,8 @@ private void pauseShardSnapshotsForNodeRemoval(String localNodeId, SnapshotsInPr
416416 ShardState .PAUSED_FOR_NODE_REMOVAL ,
417417 "paused" ,
418418 masterShardSnapshotStatus .generation (),
419- () -> {
420- indexShardSnapshotStatus .updateStatusDescription ("finished: master notification attempt complete" );
421- return null ;
422- }
419+ // Shard snapshot never began, so there is no status object to update
420+ (outcomeInfoString ) -> {}
423421 );
424422 } else {
425423 // shard snapshot currently running, mark for pause
@@ -436,9 +434,8 @@ private Runnable newShardSnapshotTask(
436434 final IndexVersion entryVersion ,
437435 final long entryStartTime
438436 ) {
439- Supplier <Void > postMasterNotificationAction = () -> {
440- snapshotStatus .updateStatusDescription ("finished: master notification attempt complete" );
441- return null ;
437+ Consumer <String > postMasterNotificationAction = (outcomeInfoString ) -> {
438+ snapshotStatus .updateStatusDescription ("Data node shard snapshot finished. Remote master update outcome: " + outcomeInfoString );
442439 };
443440
444441 // Listener that runs on completion of the shard snapshot: it will notify the master node of success or failure.
@@ -692,7 +689,15 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
692689 snapshot .snapshot (),
693690 shardId ,
694691 localShard .getValue ().getShardSnapshotResult (),
695- () -> null
692+ (outcomeInfoString ) -> localShard .getValue ().updateStatusDescription (
693+ Strings .format ("""
694+ Data node already successfully finished shard snapshot, but a new master needed to be notified. New
695+ remote master notification outcome: [%s]. The prior shard snapshot status description was [%s]
696+ """ ,
697+ outcomeInfoString ,
698+ indexShardSnapshotStatus .getStatusDescription ()
699+ )
700+ )
696701 );
697702 } else if (stage == Stage .FAILURE ) {
698703 // but we think the shard failed - we need to make new master know that the shard failed
@@ -708,7 +713,17 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
708713 ShardState .FAILED ,
709714 indexShardSnapshotStatus .getFailure (),
710715 localShard .getValue ().generation (),
711- () -> null
716+ // Update the original statusDescription with the latest remote master call outcome, but include the old
717+ // response. This will allow us to see when/whether the information reached the previous and current master.
718+ (outcomeInfoString ) -> localShard .getValue ().updateStatusDescription (
719+ Strings .format ("""
720+ Data node already failed shard snapshot, but a new master needed to be notified. New remote master
721+ notification outcome: [%s]. The prior shard snapshot status description was [%s]
722+ """ ,
723+ outcomeInfoString ,
724+ indexShardSnapshotStatus .getStatusDescription ()
725+ )
726+ )
712727 );
713728 } else if (stage == Stage .PAUSED ) {
714729 // but we think the shard has paused - we need to make new master know that
@@ -722,7 +737,16 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
722737 ShardState .PAUSED_FOR_NODE_REMOVAL ,
723738 indexShardSnapshotStatus .getFailure (),
724739 localShard .getValue ().generation (),
725- () -> null
740+ (outcomeInfoString ) -> localShard .getValue ()
741+ .updateStatusDescription (
742+ Strings .format ("""
743+ Data node already paused shard snapshot, but a new master needed to be notified. New remote
744+ master notification outcome: [%s]. The prior shard snapshot status description was [%s]
745+ """ ,
746+ outcomeInfoString ,
747+ indexShardSnapshotStatus .getStatusDescription ()
748+ )
749+ )
726750 );
727751 }
728752 }
@@ -739,7 +763,7 @@ private void notifySuccessfulSnapshotShard(
739763 final Snapshot snapshot ,
740764 final ShardId shardId ,
741765 ShardSnapshotResult shardSnapshotResult ,
742- Supplier < Void > postMasterNotificationAction
766+ Consumer < String > postMasterNotificationAction
743767 ) {
744768 assert shardSnapshotResult != null ;
745769 assert shardSnapshotResult .getGeneration () != null ;
@@ -760,7 +784,7 @@ private void notifyUnsuccessfulSnapshotShard(
760784 final ShardState shardState ,
761785 final String failure ,
762786 final ShardGeneration generation ,
763- Supplier < Void > postMasterNotificationAction
787+ Consumer < String > postMasterNotificationAction
764788 ) {
765789 assert shardState == ShardState .FAILED || shardState == ShardState .PAUSED_FOR_NODE_REMOVAL : shardState ;
766790 sendSnapshotShardUpdate (
@@ -784,29 +808,31 @@ private void sendSnapshotShardUpdate(
784808 final Snapshot snapshot ,
785809 final ShardId shardId ,
786810 final ShardSnapshotStatus status ,
787- Supplier < Void > postMasterNotificationAction
811+ Consumer < String > postMasterNotificationAction
788812 ) {
789813 ActionListener <Void > updateResultListener = new ActionListener <>() {
790814 @ Override
791815 public void onResponse (Void aVoid ) {
816+ snapshotShutdownProgressTracker .releaseRequestSentToMaster (snapshot , shardId );
817+ postMasterNotificationAction .accept (
818+ Strings .format ("successfully sent shard snapshot state [%s] update to the master node" , status .state ())
819+ );
792820 logger .trace ("[{}][{}] updated snapshot state to [{}]" , shardId , snapshot , status );
793821 }
794822
795823 @ Override
796824 public void onFailure (Exception e ) {
825+ snapshotShutdownProgressTracker .releaseRequestSentToMaster (snapshot , shardId );
826+ postMasterNotificationAction .accept (
827+ Strings .format ("exception trying to send shard snapshot state [%s] update to the master node [%s]" , status .state (), e )
828+ );
797829 logger .warn (() -> format ("[%s][%s] failed to update snapshot state to [%s]" , shardId , snapshot , status ), e );
798830 }
799831 };
800832
801- snapshotShutdownProgressTracker .trackRequestSentToMaster (snapshot , shardId );
802- var releaseTrackerRequestRunsBeforeResultListener = ActionListener .runBefore (updateResultListener , () -> {
803- snapshotShutdownProgressTracker .releaseRequestSentToMaster (snapshot , shardId );
804- postMasterNotificationAction .get ();
805- });
806-
807833 remoteFailedRequestDeduplicator .executeOnce (
808834 new UpdateIndexShardSnapshotStatusRequest (snapshot , shardId , status ),
809- releaseTrackerRequestRunsBeforeResultListener ,
835+ updateResultListener ,
810836 (req , reqListener ) -> transportService .sendRequest (
811837 transportService .getLocalNode (),
812838 SnapshotsService .UPDATE_SNAPSHOT_STATUS_ACTION_NAME ,
0 commit comments