
Commit 948c454

REP-5329: Implement multi-retry.
This rewrites much of the retryer so that it can run multiple callbacks in parallel. The parallel retry logic is plugged into the verifier’s document comparison logic.
Parent: 17d6f39 · Commit: 948c454

10 files changed: 318 additions, 125 deletions

internal/partitions/partitions.go

Lines changed: 5 additions & 5 deletions
@@ -324,7 +324,7 @@ func GetSizeAndDocumentCount(ctx context.Context, logger *logger.Logger, retryer
         Capped bool `bson:"capped"`
     }{}
 
-    err := retryer.Run(ctx, logger, func(ctx context.Context, ri *retry.Info) error {
+    err := retryer.Run(ctx, logger, func(ctx context.Context, ri *retry.FuncInfo) error {
         ri.Log(logger.Logger, "collStats", "source", srcDB.Name(), collName, "Retrieving collection size and document count.")
         request := bson.D{
             {"aggregate", collName},
@@ -395,7 +395,7 @@ func GetDocumentCountAfterFiltering(ctx context.Context, logger *logger.Logger,
     }
     pipeline = append(pipeline, bson.D{{"$count", "numFilteredDocs"}})
 
-    err := retryer.Run(ctx, logger, func(ctx context.Context, ri *retry.Info) error {
+    err := retryer.Run(ctx, logger, func(ctx context.Context, ri *retry.FuncInfo) error {
         ri.Log(logger.Logger, "count", "source", srcDB.Name(), collName, "Counting filtered documents.")
         request := bson.D{
             {"aggregate", collName},
@@ -488,7 +488,7 @@ func getOuterIDBound(
     }...)
 
     // Get one document containing only the smallest or largest _id value in the collection.
-    err := retryer.Run(ctx, subLogger, func(ctx context.Context, ri *retry.Info) error {
+    err := retryer.Run(ctx, subLogger, func(ctx context.Context, ri *retry.FuncInfo) error {
         ri.Log(subLogger.Logger, "aggregate", "source", srcDB.Name(), collName, fmt.Sprintf("getting %s _id partition bound", minOrMaxBound))
         cursor, cmdErr :=
             srcDB.RunCommandCursor(ctx, bson.D{
@@ -577,7 +577,7 @@ func getMidIDBounds(
     // Get a cursor for the $sample and $bucketAuto aggregation.
     var midIDBounds []interface{}
     agRetryer := retryer.WithErrorCodes(util.SampleTooManyDuplicates)
-    err := agRetryer.Run(ctx, logger, func(ctx context.Context, ri *retry.Info) error {
+    err := agRetryer.Run(ctx, logger, func(ctx context.Context, ri *retry.FuncInfo) error {
         ri.Log(logger.Logger, "aggregate", "source", srcDB.Name(), collName, "Retrieving mid _id partition bounds using $sample.")
         cursor, cmdErr :=
             srcDB.RunCommandCursor(ctx, bson.D{
@@ -613,7 +613,7 @@ func getMidIDBounds(
 
             // Append the copied bound to the other mid _id bounds.
             midIDBounds = append(midIDBounds, bound)
-            ri.IterationSuccess()
+            ri.NoteSuccess()
         }
 
         return cursor.Err()

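The partitions.go changes are mechanical: each retry callback now receives a *retry.FuncInfo instead of *retry.Info, and per-request success is recorded with NoteSuccess() rather than IterationSuccess(). Below is a minimal sketch of the updated call-site shape. The helper name, package name, and pipeline are hypothetical, and the retryer, logger, and database handle are passed in as assumptions rather than taken from the real functions above.

package example // hypothetical package, for illustration only

import (
    "context"

    "github.com/10gen/migration-verifier/internal/logger"
    "github.com/10gen/migration-verifier/internal/retry"
    "go.mongodb.org/mongo-driver/bson"
    "go.mongodb.org/mongo-driver/mongo"
)

// countDocsSketch mirrors the call sites above: the callback takes
// *retry.FuncInfo, and NoteSuccess replaces IterationSuccess.
func countDocsSketch(
    ctx context.Context,
    retryer *retry.Retryer,
    lgr *logger.Logger,
    srcDB *mongo.Database,
    collName string,
) error {
    return retryer.Run(ctx, lgr, func(ctx context.Context, ri *retry.FuncInfo) error {
        ri.Log(lgr.Logger, "aggregate", "source", srcDB.Name(), collName, "Counting documents.")

        cursor, cmdErr := srcDB.RunCommandCursor(ctx, bson.D{
            {"aggregate", collName},
            {"pipeline", bson.A{bson.D{{"$count", "numDocs"}}}},
            {"cursor", bson.D{}},
        })
        if cmdErr != nil {
            return cmdErr
        }
        defer cursor.Close(ctx)

        // A successful request resets this callback's retry-duration clock.
        ri.NoteSuccess()

        return cursor.Err()
    })
}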
internal/retry/error.go

Lines changed: 12 additions & 0 deletions
@@ -25,3 +25,15 @@ func (rde RetryDurationLimitExceededErr) Error() string {
 func (rde RetryDurationLimitExceededErr) Unwrap() error {
     return rde.lastErr
 }
+
+// errgroupErr is an internal error type that we return from errgroup
+// callbacks. It allows us to know (reliably) which error is the one
+// that triggers the errgroup's failure
+type errgroupErr struct {
+    funcNum         int
+    errFromCallback error
+}
+
+func (ege errgroupErr) Error() string {
+    return fmt.Sprintf("func %d failed: %v", ege.funcNum, ege.errFromCallback)
+}

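errgroupErr exists so the retry loop can tell which callback's error actually tore down the errgroup. The standalone sketch below shows that errors.As lookup; the type is copied here purely for illustration (the real definition lives in internal/retry/error.go), and the sample error text is made up.

package main

import (
    "errors"
    "fmt"
)

// Local copy of the internal type, reproduced only to show the pattern.
type errgroupErr struct {
    funcNum         int
    errFromCallback error
}

func (ege errgroupErr) Error() string {
    return fmt.Sprintf("func %d failed: %v", ege.funcNum, ege.errFromCallback)
}

func main() {
    // Simulate what eg.Wait() hands back after one callback fails.
    err := error(errgroupErr{funcNum: 2, errFromCallback: errors.New("connection reset")})

    // The retry loop uses errors.As to recover the callback index and
    // the callback's own error.
    var ege errgroupErr
    if errors.As(err, &ege) {
        fmt.Printf("callback %d failed: %v\n", ege.funcNum, ege.errFromCallback)
    }
}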
internal/retry/retry.go

Lines changed: 82 additions & 23 deletions
@@ -2,17 +2,35 @@ package retry
 
 import (
     "context"
+    "errors"
+    "fmt"
     "math/rand"
     "time"
 
     "github.com/10gen/migration-verifier/internal/logger"
     "github.com/10gen/migration-verifier/internal/util"
+    "github.com/samber/lo"
+    "golang.org/x/sync/errgroup"
 )
 
-type RetryCallback = func(context.Context, *Info) error
+type RetryCallback = func(context.Context, *FuncInfo) error
 
-// Run retries f() whenever a transient error happens, up to the retryer's
-// configured duration limit.
+// Run() runs each given callback in parallel. If none of them fail,
+// then no error is returned.
+//
+// If one of them fails, the other callbacks' contexts are canceled.
+// If the error is non-transient, it's returned. If the error is transient,
+// though, then the retryer reruns each callback.
+//
+// The retryer tracks the last time each callback either a) succeeded or b)
+// was canceled. Whenever a callback fails, the retryer checks how long it
+// has gone since a success/cancellation. If that time period exceeds the
+// retryer's duration limit, then the retry loop ends, and a
+// RetryDurationLimitExceededErr is returned.
+//
+// Note that, if a given callback runs multiple potentially-retryable requests,
+// each successful request should be noted in the callback's FuncInfo.
+// See that struct's documentation for more details.
 //
 // IMPORTANT: This function should generally NOT be used within a transaction
 // callback. It may be used within a transaction callback if and only if:
@@ -26,7 +44,7 @@ type RetryCallback = func(context.Context, *Info) error
 // This returns an error if the duration limit is reached, or if f() returns a
 // non-transient error.
 func (r *Retryer) Run(
-    ctx context.Context, logger *logger.Logger, f RetryCallback,
+    ctx context.Context, logger *logger.Logger, f ...RetryCallback,
 ) error {
     return r.runRetryLoop(ctx, logger, f)
 }
@@ -35,29 +53,76 @@ func (r *Retryer) Run(
 func (r *Retryer) runRetryLoop(
     ctx context.Context,
     logger *logger.Logger,
-    f RetryCallback,
+    funcs []RetryCallback,
 ) error {
     var err error
 
-    ri := &Info{
+    startTime := time.Now()
+
+    li := &LoopInfo{
         durationLimit: r.retryLimit,
-        lastResetTime: time.Now(),
     }
+    funcinfos := lo.RepeatBy(
+        len(funcs),
+        func(_ int) *FuncInfo {
+            return &FuncInfo{
+                lastResetTime: startTime,
+                loopInfo:      li,
+            }
+        },
+    )
     sleepTime := minSleepTime
 
     for {
-        err = f(ctx, ri)
+        eg, egCtx := errgroup.WithContext(ctx)
+        for i, curFunc := range funcs {
 
-        // If f() returned a transient error, sleep and increase the sleep
-        // time for the next retry, maxing out at the maxSleepTime.
+            eg.Go(func() error {
+                err := curFunc(egCtx, funcinfos[i])
+
+                if err != nil {
+                    return errgroupErr{
+                        funcNum:         i,
+                        errFromCallback: err,
+                    }
+                }
+
+                return nil
+            })
+        }
+        err = eg.Wait()
+
+        // No error? Success!
         if err == nil {
             return nil
         }
 
-        if !r.shouldRetryWithSleep(logger, sleepTime, err) {
-            return err
+        // Let's get the actual error from the function.
+        groupErr := errgroupErr{}
+        if !errors.As(err, &groupErr) {
+            panic(fmt.Sprintf("Error should be a %T, not %T: %v", groupErr, err, err))
         }
 
+        // Not a transient error? Fail immediately.
+        if !r.shouldRetryWithSleep(logger, sleepTime, groupErr.errFromCallback) {
+            return groupErr.errFromCallback
+        }
+
+        li.attemptNumber++
+
+        // Our error is transient. If we've exhausted the allowed time
+        // then fail.
+        failedFuncInfo := funcinfos[groupErr.funcNum]
+        if failedFuncInfo.GetDurationSoFar() > li.durationLimit {
+            return RetryDurationLimitExceededErr{
+                attempts: li.attemptNumber,
+                duration: failedFuncInfo.GetDurationSoFar(),
                lastErr:  groupErr.errFromCallback,
+            }
+        }
+
+        // Sleep and increase the sleep time for the next retry,
+        // up to maxSleepTime.
         select {
         case <-ctx.Done():
             logger.Error().Err(ctx.Err()).Msg("Context was canceled. Aborting retry loop.")
@@ -69,18 +134,12 @@ func (r *Retryer) runRetryLoop(
             }
         }
 
-        ri.attemptNumber++
-
-        if ri.shouldResetDuration {
-            ri.lastResetTime = time.Now()
-            ri.shouldResetDuration = false
-        }
+        now := time.Now()
 
-        if ri.GetDurationSoFar() > ri.durationLimit {
-            return RetryDurationLimitExceededErr{
-                attempts: ri.attemptNumber,
-                duration: ri.GetDurationSoFar(),
-                lastErr:  err,
+        // Set all of the funcs that did *not* fail as having just succeeded.
+        for i, curInfo := range funcinfos {
+            if i != groupErr.funcNum {
+                curInfo.lastResetTime = now
             }
         }
     }

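With the variadic signature, one Run call can drive several parallel operations under a single retry budget, which is how the commit message says the verifier's document comparison uses it. The sketch below is a hedged illustration of that call shape, not the verifier's actual comparison code: the helper name compareSketch, the collections, and the filter are hypothetical, and the Retryer/Logger construction (not part of this diff) is assumed to happen elsewhere.

package example // hypothetical package, for illustration only

import (
    "context"

    "github.com/10gen/migration-verifier/internal/logger"
    "github.com/10gen/migration-verifier/internal/retry"
    "go.mongodb.org/mongo-driver/bson"
    "go.mongodb.org/mongo-driver/mongo"
)

// compareSketch fetches documents from the source and destination in
// parallel. If either callback returns a transient error, the retryer
// cancels the other callback's context and reruns both.
func compareSketch(
    ctx context.Context,
    retryer *retry.Retryer,
    lgr *logger.Logger,
    srcColl, dstColl *mongo.Collection,
    filter bson.D,
) ([]bson.Raw, []bson.Raw, error) {
    var srcDocs, dstDocs []bson.Raw

    err := retryer.Run(
        ctx,
        lgr,
        func(ctx context.Context, fi *retry.FuncInfo) error {
            cursor, err := srcColl.Find(ctx, filter)
            if err != nil {
                return err
            }
            if err := cursor.All(ctx, &srcDocs); err != nil {
                return err
            }
            fi.NoteSuccess() // reset this callback's duration measurement
            return nil
        },
        func(ctx context.Context, fi *retry.FuncInfo) error {
            cursor, err := dstColl.Find(ctx, filter)
            if err != nil {
                return err
            }
            if err := cursor.All(ctx, &dstDocs); err != nil {
                return err
            }
            fi.NoteSuccess()
            return nil
        },
    )

    return srcDocs, dstDocs, err
}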
internal/retry/retry_info.go

Lines changed: 21 additions & 22 deletions
@@ -7,19 +7,20 @@ import (
     "github.com/rs/zerolog"
 )
 
-// Info stores information relevant to the retrying done. It should
+// LoopInfo stores information relevant to the retrying done. It should
 // primarily be used within the closure passed to the retry helpers.
 //
 // The attempt number is 0-indexed (0 means this is the first attempt).
 // The duration tracks the duration of retrying for transient errors only.
-type Info struct {
+type LoopInfo struct {
     attemptNumber int
-
-    lastResetTime time.Time
     durationLimit time.Duration
+}
 
-    // Used to reset the time elapsed for long running operations.
-    shouldResetDuration bool
+type FuncInfo struct {
+    loopInfo *LoopInfo
+
+    lastResetTime time.Time
 }
 
 // Log will log a debug-level message for the current Info values and the provided strings.
@@ -30,7 +31,7 @@ type Info struct {
 //
 // Useful for keeping track of DDL commands that access/change the cluster in some way.
 // Generally not recommended for CRUD commands, which may result in too many log lines.
-func (ri *Info) Log(logger *zerolog.Logger, cmdName string, clientType string, database string, collection string, msg string) {
+func (fi *FuncInfo) Log(logger *zerolog.Logger, cmdName string, clientType string, database string, collection string, msg string) {
     // Don't log if no logger is provided. Mostly useful for
     // integration tests where we don't want additional logs.
     if logger == nil {
@@ -51,31 +52,29 @@ func (ri *Info) Log(logger *zerolog.Logger, cmdName string, clientType string, d
         event.Str("collection", collection)
     }
     event.Str("context", msg).
-        Int("attemptNumber", ri.attemptNumber).
-        Str("durationSoFar", reportutils.DurationToHMS(ri.GetDurationSoFar())).
-        Str("durationLimit", reportutils.DurationToHMS(ri.durationLimit)).
+        Int("attemptNumber", fi.GetAttemptNumber()).
+        Str("durationSoFar", reportutils.DurationToHMS(fi.GetDurationSoFar())).
+        Str("durationLimit", reportutils.DurationToHMS(fi.loopInfo.durationLimit)).
         Msg("Running retryable function")
 }
 
 // GetAttemptNumber returns the Info's current attempt number (0-indexed).
-func (ri *Info) GetAttemptNumber() int {
-    return ri.attemptNumber
+func (fi *FuncInfo) GetAttemptNumber() int {
+    return fi.loopInfo.attemptNumber
 }
 
 // GetDurationSoFar returns the Info's current duration so far. This duration
 // applies to the duration of retrying for transient errors only.
-func (ri *Info) GetDurationSoFar() time.Duration {
-    return time.Since(ri.lastResetTime)
+func (fi *FuncInfo) GetDurationSoFar() time.Duration {
+    return time.Since(fi.lastResetTime)
 }
 
-// IterationSuccess is used to tell the retry util to reset its measurement
+// NoteSuccess is used to tell the retry util to reset its measurement
 // of how long the closure has been running for. This is useful for long
 // running operations that might run successfully for a few days and then fail.
-// Essentially, calling this function tells the retry util not to include the
-// closure's run time as a part of the overall measurement of how long the
-// closure took including retries, since that measurement is used to determine
-// whether we want to retry the operation or not. (If the measurement is greater
-// than the retry time, we will not retry.)
-func (ri *Info) IterationSuccess() {
-    ri.shouldResetDuration = true
+//
+// Call this after every successful command in a multi-command callback.
+// (It’s useless--but harmless--in a single-command callback.)
+func (i *FuncInfo) NoteSuccess() {
+    i.lastResetTime = time.Now()
 }

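Per the NoteSuccess doc comment, a callback that issues several retryable commands should note each success so the duration clock restarts from the most recent success rather than from the start of the attempt. A small hypothetical sketch follows; collA, collB, retryer, and lgr are assumed to already exist.

// Sketch: a callback that issues two commands and notes success after each.
err := retryer.Run(ctx, lgr, func(ctx context.Context, fi *retry.FuncInfo) error {
    if _, err := collA.CountDocuments(ctx, bson.D{}); err != nil {
        return err
    }
    fi.NoteSuccess() // first command succeeded; restart the duration clock

    if _, err := collB.CountDocuments(ctx, bson.D{}); err != nil {
        return err
    }
    fi.NoteSuccess()

    return nil
})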