@@ -2,17 +2,35 @@ package retry
22
33import (
44 "context"
5+ "errors"
6+ "fmt"
57 "math/rand"
68 "time"
79
810 "github.com/10gen/migration-verifier/internal/logger"
911 "github.com/10gen/migration-verifier/internal/util"
12+ "github.com/samber/lo"
13+ "golang.org/x/sync/errgroup"
1014)
1115
12- type RetryCallback = func (context.Context , * Info ) error
16+ type RetryCallback = func (context.Context , * FuncInfo ) error
1317
14- // Run retries f() whenever a transient error happens, up to the retryer's
15- // configured duration limit.
18+ // Run runs each given callback in parallel. If none of them fail,
19+ // then no error is returned.
20+ //
21+ // If one of them fails, the other callbacks' contexts are canceled.
22+ // If the error is non-transient, it's returned. If the error is transient,
23+ // though, then the retryer reruns each callback.
24+ //
25+ // The retryer tracks the last time each callback either a) succeeded or b)
26+ // was canceled. Whenever a callback fails, the retryer checks how long it has
27+ // been since that callback's last success/cancellation. If that time exceeds the
28+ // retryer's duration limit, then the retry loop ends, and a
29+ // RetryDurationLimitExceededErr is returned.
30+ //
31+ // Note that, if a given callback runs multiple potentially-retryable requests,
32+ // each successful request should be noted in the callback's FuncInfo.
33+ // See that struct's documentation for more details.
1634//
1735// IMPORTANT: This function should generally NOT be used within a transaction
1836// callback. It may be used within a transaction callback if and only if:
@@ -26,7 +44,7 @@ type RetryCallback = func(context.Context, *Info) error
2644// This returns an error if the duration limit is reached, or if any callback
2745// returns a non-transient error.
2846func (r * Retryer ) Run (
29- ctx context.Context , logger * logger.Logger , f RetryCallback ,
47+ ctx context.Context , logger * logger.Logger , f ... RetryCallback ,
3048) error {
3149 return r .runRetryLoop (ctx , logger , f )
3250}
@@ -35,29 +53,76 @@ func (r *Retryer) Run(
3553func (r * Retryer ) runRetryLoop (
3654 ctx context.Context ,
3755 logger * logger.Logger ,
38- f RetryCallback ,
56+ funcs [] RetryCallback ,
3957) error {
4058 var err error
4159
42- ri := & Info {
60+ startTime := time .Now ()
61+
62+ li := & LoopInfo {
4363 durationLimit : r .retryLimit ,
44- lastResetTime : time .Now (),
4564 }
65+ funcinfos := lo .RepeatBy (
66+ len (funcs ),
67+ func (_ int ) * FuncInfo {
68+ return & FuncInfo {
69+ lastResetTime : startTime ,
70+ loopInfo : li ,
71+ }
72+ },
73+ )
4674 sleepTime := minSleepTime
4775
4876 for {
49- err = f (ctx , ri )
77+ eg , egCtx := errgroup .WithContext (ctx )
78+ for i , curFunc := range funcs {
5079
51- // If f() returned a transient error, sleep and increase the sleep
52- // time for the next retry, maxing out at the maxSleepTime.
80+ eg .Go (func () error {
81+ err := curFunc (egCtx , funcinfos [i ])
82+
83+ if err != nil {
84+ return errgroupErr {
85+ funcNum : i ,
86+ errFromCallback : err ,
87+ }
88+ }
89+
90+ return nil
91+ })
92+ }
93+ err = eg .Wait ()
94+
95+ // No error? Success!
5396 if err == nil {
5497 return nil
5598 }
5699
57- if ! r .shouldRetryWithSleep (logger , sleepTime , err ) {
58- return err
100+ // Let's get the actual error from the function.
101+ groupErr := errgroupErr {}
102+ if ! errors .As (err , & groupErr ) {
103+ panic (fmt .Sprintf ("Error should be a %T, not %T: %v" , groupErr , err , err ))
59104 }
60105
106+ // Not a transient error? Fail immediately.
107+ if ! r .shouldRetryWithSleep (logger , sleepTime , groupErr .errFromCallback ) {
108+ return groupErr .errFromCallback
109+ }
110+
111+ li .attemptNumber ++
112+
113+ // Our error is transient. If we've exhausted the allowed time
114+ // then fail.
115+ failedFuncInfo := funcinfos [groupErr .funcNum ]
116+ if failedFuncInfo .GetDurationSoFar () > li .durationLimit {
117+ return RetryDurationLimitExceededErr {
118+ attempts : li .attemptNumber ,
119+ duration : failedFuncInfo .GetDurationSoFar (),
120+ lastErr : groupErr .errFromCallback ,
121+ }
122+ }
123+
124+ // Sleep and increase the sleep time for the next retry,
125+ // up to maxSleepTime.
61126 select {
62127 case <- ctx .Done ():
63128 logger .Error ().Err (ctx .Err ()).Msg ("Context was canceled. Aborting retry loop." )
@@ -69,18 +134,12 @@ func (r *Retryer) runRetryLoop(
69134 }
70135 }
71136
72- ri .attemptNumber ++
73-
74- if ri .shouldResetDuration {
75- ri .lastResetTime = time .Now ()
76- ri .shouldResetDuration = false
77- }
137+ now := time .Now ()
78138
79- if ri .GetDurationSoFar () > ri .durationLimit {
80- return RetryDurationLimitExceededErr {
81- attempts : ri .attemptNumber ,
82- duration : ri .GetDurationSoFar (),
83- lastErr : err ,
139+ // Set all of the funcs that did *not* fail as having just succeeded.
140+ for i , curInfo := range funcinfos {
141+ if i != groupErr .funcNum {
142+ curInfo .lastResetTime = now
84143 }
85144 }
86145 }
0 commit comments