6161import java .util .Iterator ;
6262import java .util .List ;
6363import java .util .Map ;
64- import java .util .function .Supplier ;
64+ import java .util .function .Consumer ;
6565
6666import static java .util .Collections .emptyMap ;
6767import static org .elasticsearch .core .Strings .format ;
@@ -340,7 +340,8 @@ private void handleUpdatedSnapshotsInProgressEntry(String localNodeId, boolean r
340340 ShardState .FAILED ,
341341 shard .getValue ().reason (),
342342 shard .getValue ().generation (),
343- () -> null
343+ // Shard snapshot never began, so there is no status object to update.
344+ (outcomeInfoString ) -> {}
344345 );
345346 }
346347 } else {
@@ -403,7 +404,6 @@ private void pauseShardSnapshotsForNodeRemoval(String localNodeId, SnapshotsInPr
403404 for (final Map .Entry <ShardId , ShardSnapshotStatus > shardEntry : entry .shards ().entrySet ()) {
404405 final ShardId shardId = shardEntry .getKey ();
405406 final ShardSnapshotStatus masterShardSnapshotStatus = shardEntry .getValue ();
406- IndexShardSnapshotStatus indexShardSnapshotStatus = localShardSnapshots .get (shardId );
407407
408408 if (masterShardSnapshotStatus .state () != ShardState .INIT ) {
409409 // shard snapshot not currently scheduled by master
@@ -424,10 +424,8 @@ private void pauseShardSnapshotsForNodeRemoval(String localNodeId, SnapshotsInPr
424424 ShardState .PAUSED_FOR_NODE_REMOVAL ,
425425 "paused" ,
426426 masterShardSnapshotStatus .generation (),
427- () -> {
428- indexShardSnapshotStatus .updateStatusDescription ("finished: master notification attempt complete" );
429- return null ;
430- }
427+ // Shard snapshot never began, so there is no status object to update
428+ (outcomeInfoString ) -> {}
431429 );
432430 } else {
433431 // shard snapshot currently running, mark for pause
@@ -444,9 +442,8 @@ private Runnable newShardSnapshotTask(
444442 final IndexVersion entryVersion ,
445443 final long entryStartTime
446444 ) {
447- Supplier <Void > postMasterNotificationAction = () -> {
448- snapshotStatus .updateStatusDescription ("finished: master notification attempt complete" );
449- return null ;
445+ Consumer <String > postMasterNotificationAction = (outcomeInfoString ) -> {
446+ snapshotStatus .updateStatusDescription ("Data node shard snapshot finished. Remote master update outcome: " + outcomeInfoString );
450447 };
451448
452449 // Listener that runs on completion of the shard snapshot: it will notify the master node of success or failure.
@@ -687,6 +684,8 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
687684 if (masterShard != null && masterShard .state ().completed () == false ) {
688685 final IndexShardSnapshotStatus .Copy indexShardSnapshotStatus = localShard .getValue ().asCopy ();
689686 final Stage stage = indexShardSnapshotStatus .getStage ();
687+ final String statusDescription = indexShardSnapshotStatus .getStatusDescription ();
688+ final int maxStatusAppend = 1000 ;
690689 // Master knows about the shard and thinks it has not completed
691690 if (stage == Stage .DONE ) {
692691 // but we think the shard is done - we need to make new master know that the shard is done
@@ -700,7 +699,20 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
700699 snapshot .snapshot (),
701700 shardId ,
702701 localShard .getValue ().getShardSnapshotResult (),
703- () -> null
702+ (outcomeInfoString ) -> localShard .getValue ()
703+ .updateStatusDescription (
704+ Strings .format (
705+ """
706+ Data node already successfully finished shard snapshot, but a new master needed to be
707+ notified. New remote master notification outcome: [%s]. The prior shard snapshot status
708+ description was [%s]
709+ """ ,
710+ outcomeInfoString ,
711+ statusDescription .length () < maxStatusAppend
712+ ? statusDescription
713+ : statusDescription .substring (0 , maxStatusAppend )
714+ )
715+ )
704716 );
705717 } else if (stage == Stage .FAILURE ) {
706718 // but we think the shard failed - we need to make new master know that the shard failed
@@ -716,7 +728,21 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
716728 ShardState .FAILED ,
717729 indexShardSnapshotStatus .getFailure (),
718730 localShard .getValue ().generation (),
719- () -> null
731+ // Update the original statusDescription with the latest remote master call outcome, but include the old
732+ // response. This will allow us to see when/whether the information reached the previous and current master.
733+ (outcomeInfoString ) -> localShard .getValue ()
734+ .updateStatusDescription (
735+ Strings .format (
736+ """
737+ Data node already failed shard snapshot, but a new master needed to be notified. New remote
738+ master notification outcome: [%s]. The prior shard snapshot status description was [%s]
739+ """ ,
740+ outcomeInfoString ,
741+ statusDescription .length () < maxStatusAppend
742+ ? statusDescription
743+ : statusDescription .substring (0 , maxStatusAppend )
744+ )
745+ )
720746 );
721747 } else if (stage == Stage .PAUSED ) {
722748 // but we think the shard has paused - we need to make new master know that
@@ -730,7 +756,19 @@ private void syncShardStatsOnNewMaster(List<SnapshotsInProgress.Entry> entries)
730756 ShardState .PAUSED_FOR_NODE_REMOVAL ,
731757 indexShardSnapshotStatus .getFailure (),
732758 localShard .getValue ().generation (),
733- () -> null
759+ (outcomeInfoString ) -> localShard .getValue ()
760+ .updateStatusDescription (
761+ Strings .format (
762+ """
763+ Data node already paused shard snapshot, but a new master needed to be notified. New remote
764+ master notification outcome: [%s]. The prior shard snapshot status description was [%s]
765+ """ ,
766+ outcomeInfoString ,
767+ statusDescription .length () < maxStatusAppend
768+ ? statusDescription
769+ : statusDescription .substring (0 , maxStatusAppend )
770+ )
771+ )
734772 );
735773 }
736774 }
@@ -747,7 +785,7 @@ private void notifySuccessfulSnapshotShard(
747785 final Snapshot snapshot ,
748786 final ShardId shardId ,
749787 ShardSnapshotResult shardSnapshotResult ,
750- Supplier < Void > postMasterNotificationAction
788+ Consumer < String > postMasterNotificationAction
751789 ) {
752790 assert shardSnapshotResult != null ;
753791 assert shardSnapshotResult .getGeneration () != null ;
@@ -768,7 +806,7 @@ private void notifyUnsuccessfulSnapshotShard(
768806 final ShardState shardState ,
769807 final String failure ,
770808 final ShardGeneration generation ,
771- Supplier < Void > postMasterNotificationAction
809+ Consumer < String > postMasterNotificationAction
772810 ) {
773811 assert shardState == ShardState .FAILED || shardState == ShardState .PAUSED_FOR_NODE_REMOVAL : shardState ;
774812 sendSnapshotShardUpdate (
@@ -792,29 +830,32 @@ private void sendSnapshotShardUpdate(
792830 final Snapshot snapshot ,
793831 final ShardId shardId ,
794832 final ShardSnapshotStatus status ,
795- Supplier < Void > postMasterNotificationAction
833+ Consumer < String > postMasterNotificationAction
796834 ) {
835+ snapshotShutdownProgressTracker .trackRequestSentToMaster (snapshot , shardId );
797836 ActionListener <Void > updateResultListener = new ActionListener <>() {
798837 @ Override
799838 public void onResponse (Void aVoid ) {
839+ snapshotShutdownProgressTracker .releaseRequestSentToMaster (snapshot , shardId );
840+ postMasterNotificationAction .accept (
841+ Strings .format ("successfully sent shard snapshot state [%s] update to the master node" , status .state ())
842+ );
800843 logger .trace ("[{}][{}] updated snapshot state to [{}]" , shardId , snapshot , status );
801844 }
802845
803846 @ Override
804847 public void onFailure (Exception e ) {
848+ snapshotShutdownProgressTracker .releaseRequestSentToMaster (snapshot , shardId );
849+ postMasterNotificationAction .accept (
850+ Strings .format ("exception trying to send shard snapshot state [%s] update to the master node [%s]" , status .state (), e )
851+ );
805852 logger .warn (() -> format ("[%s][%s] failed to update snapshot state to [%s]" , shardId , snapshot , status ), e );
806853 }
807854 };
808855
809- snapshotShutdownProgressTracker .trackRequestSentToMaster (snapshot , shardId );
810- var releaseTrackerRequestRunsBeforeResultListener = ActionListener .runBefore (updateResultListener , () -> {
811- snapshotShutdownProgressTracker .releaseRequestSentToMaster (snapshot , shardId );
812- postMasterNotificationAction .get ();
813- });
814-
815856 remoteFailedRequestDeduplicator .executeOnce (
816857 new UpdateIndexShardSnapshotStatusRequest (snapshot , shardId , status ),
817- releaseTrackerRequestRunsBeforeResultListener ,
858+ updateResultListener ,
818859 (req , reqListener ) -> transportService .sendRequest (
819860 transportService .getLocalNode (),
820861 SnapshotsService .UPDATE_SNAPSHOT_STATUS_ACTION_NAME ,
0 commit comments