@@ -594,6 +594,39 @@ func (n nodeError) Error() string {
594
594
return fmt .Sprintf ("%s failed: %s" , n .node , n .msg )
595
595
}
596
596
597
+ // checkForRSLevelErr checks if all nodes have an error,
598
+ // and in that case true is returned.
599
+ // If any node doesn't have error, false is returned.
600
+ func (r * PhysRestore ) checkForRSLevelErr () bool {
601
+ for f := range r .syncPathPeers {
602
+ errFile := f + "." + string (defs .StatusError )
603
+ _ , err := r .stg .FileStat (errFile )
604
+ if errors .Is (err , storage .ErrNotExist ) {
605
+ return false
606
+ }
607
+ if err != nil {
608
+ r .log .Error ("error while checking file %s: %v" , errFile , err )
609
+ }
610
+ // error file is found
611
+ }
612
+ return true
613
+ }
614
+
615
+ // checkForClusterLevelErr checks if any RS (shard) has an error.
616
+ // It returns true if at least one RS has error, otherwise false.
617
+ func (r * PhysRestore ) checkForClusterLevelErr () bool {
618
+ for f := range r .syncPathShards {
619
+ errFile := f + "." + string (defs .StatusError )
620
+ _ , err := r .stg .FileStat (errFile )
621
+ if err == nil {
622
+ return true
623
+ } else if ! errors .Is (err , storage .ErrNotExist ) {
624
+ r .log .Error ("error while checking file %s: %v" , errFile , err )
625
+ }
626
+ }
627
+ return false
628
+ }
629
+
597
630
func (r * PhysRestore ) waitFiles (
598
631
status defs.Status ,
599
632
objs map [string ]struct {},
@@ -816,7 +849,7 @@ func (r *PhysRestore) Snapshot(
816
849
// set failed status of node on error, but
817
850
// don't mark node as failed after the local restore succeed
818
851
if err != nil && ! progress .is (restoreDone ) && ! errors .Is (err , ErrNoDataForShard ) {
819
- r .MarkFailed (meta , err , ! progress . is ( restoreStared ) )
852
+ r .MarkFailed (meta , err )
820
853
}
821
854
822
855
r .close (err == nil , progress .is (restoreStared ) && ! progress .is (restoreDone ))
@@ -2367,7 +2400,7 @@ func (r *PhysRestore) checkMongod(needVersion string) (version string, err error
2367
2400
}
2368
2401
2369
2402
// MarkFailed sets the restore and rs state as failed with the given message
2370
- func (r * PhysRestore ) MarkFailed (meta * RestoreMeta , e error , markCluster bool ) {
2403
+ func (r * PhysRestore ) MarkFailed (meta * RestoreMeta , e error ) {
2371
2404
var nerr nodeError
2372
2405
if errors .As (e , & nerr ) {
2373
2406
e = nerr
@@ -2390,14 +2423,14 @@ func (r *PhysRestore) MarkFailed(meta *RestoreMeta, e error, markCluster bool) {
2390
2423
// At some point, every node will try to set an rs and cluster state
2391
2424
// (in `toState` method).
2392
2425
// Here we are not aware of partlyDone etc so leave it to the `toState`.
2393
- if r .nodeInfo . IsPrimary && markCluster {
2426
+ if r .checkForRSLevelErr () {
2394
2427
serr := util .RetryableWrite (r .stg ,
2395
2428
r .syncPathRS + "." + string (defs .StatusError ), errStatus (e ))
2396
2429
if serr != nil {
2397
2430
r .log .Error ("MarkFailed: write replset error state `%v`: %v" , e , serr )
2398
2431
}
2399
2432
}
2400
- if r .nodeInfo .IsClusterLeader () && markCluster {
2433
+ if r .nodeInfo .IsLeader () && r . checkForClusterLevelErr () {
2401
2434
serr := util .RetryableWrite (r .stg ,
2402
2435
r .syncPathCluster + "." + string (defs .StatusError ), errStatus (e ))
2403
2436
if serr != nil {
0 commit comments