@@ -52,7 +52,7 @@ func CheckAndCleanSnapshot(snapshotName string, indexName string, snapshots []op
5252 }
5353 if snapshot .State == "PARTIAL" || snapshot .State == "FAILED" {
5454 logger .Info (fmt .Sprintf ("Deleting PARTIAL/FAILED snapshot snapshot=%s state=%s" , snapshotName , snapshot .State ))
55- err := client . DeleteSnapshots ( snapRepo , []string {snapshotName })
55+ err := DeleteSnapshotsWithRetry ( client , snapRepo , []string {snapshotName }, logger )
5656 if err != nil {
5757 logger .Error (fmt .Sprintf ("Failed to delete PARTIAL/FAILED snapshot snapshot=%s error=%v" , snapshotName , err ))
5858 return false , err
@@ -352,7 +352,7 @@ retryLoop:
352352 duration := time .Since (startTime )
353353 durationStr := formatDuration (duration )
354354 logger .Warn (fmt .Sprintf ("Snapshot is PARTIAL/FAILED, deleting and retrying snapshot=%s state=%s duration=%s attempt=%d" , snapshotName , snapshot .State , durationStr , attempt ))
355- err := client . DeleteSnapshots ( snapRepo , []string {snapshotName })
355+ err := DeleteSnapshotsWithRetry ( client , snapRepo , []string {snapshotName }, logger )
356356 if err != nil {
357357 logger .Error (fmt .Sprintf ("Failed to delete PARTIAL/FAILED snapshot snapshot=%s error=%v" , snapshotName , err ))
358358 } else {
@@ -479,6 +479,60 @@ func BatchDeleteSnapshots(client *opensearch.Client, snapshots []string, snapRep
479479 return successful , failed , nil
480480}
481481
482+ func DeleteSnapshotsWithRetry (client * opensearch.Client , snapRepo string , snapshotNames []string , logger * logging.Logger ) error {
483+ const maxRetries = 15
484+
485+ if len (snapshotNames ) == 0 {
486+ return nil
487+ }
488+
489+ var lastErr error
490+ for attempt := 1 ; attempt <= maxRetries ; attempt ++ {
491+ existingSnapshots := make ([]string , 0 )
492+ for _ , snapshotName := range snapshotNames {
493+ snapshots , err := GetSnapshotsIgnore404 (client , snapRepo , snapshotName )
494+ if err != nil {
495+ logger .Warn (fmt .Sprintf ("Failed to check snapshot existence snapshot=%s error=%v, will try to delete" , snapshotName , err ))
496+ existingSnapshots = append (existingSnapshots , snapshotName )
497+ continue
498+ }
499+ if len (snapshots ) > 0 {
500+ existingSnapshots = append (existingSnapshots , snapshotName )
501+ } else {
502+ logger .Info (fmt .Sprintf ("Snapshot already deleted, skipping snapshot=%s" , snapshotName ))
503+ }
504+ }
505+
506+ if len (existingSnapshots ) == 0 {
507+ logger .Info (fmt .Sprintf ("All snapshots already deleted attempt=%d snapshots=%v" , attempt , snapshotNames ))
508+ return nil
509+ }
510+
511+ logger .Info (fmt .Sprintf ("Deleting snapshots attempt=%d maxRetries=%d snapshots=%v" , attempt , maxRetries , existingSnapshots ))
512+
513+ err := client .DeleteSnapshots (snapRepo , existingSnapshots )
514+ if err != nil {
515+ lastErr = err
516+ logger .Error (fmt .Sprintf ("Failed to delete snapshots attempt=%d snapshots=%v error=%v" , attempt , existingSnapshots , err ))
517+ if attempt < maxRetries {
518+ logger .Info (fmt .Sprintf ("Waiting 1 minute before retry attempt=%d" , attempt + 1 ))
519+ time .Sleep (1 * time .Minute )
520+ continue
521+ }
522+ } else {
523+ logger .Info (fmt .Sprintf ("Snapshots deleted successfully attempt=%d snapshots=%v" , attempt , existingSnapshots ))
524+ return nil
525+ }
526+ }
527+
528+ if lastErr != nil {
529+ logger .Error (fmt .Sprintf ("Failed to delete snapshots after all retries maxRetries=%d snapshots=%v error=%v" , maxRetries , snapshotNames , lastErr ))
530+ return lastErr
531+ }
532+
533+ return nil
534+ }
535+
482536type SnapshotGroup struct {
483537 SnapshotName string
484538 Indices []string
0 commit comments