Skip to content

Commit 003e9ea

Browse files
committed
retry on change stream failure
1 parent a85f5a6 commit 003e9ea

29 files changed

+3379
-35
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ require (
4242
github.com/pelletier/go-toml/v2 v2.0.1 // indirect
4343
github.com/pmezard/go-difflib v1.0.0 // indirect
4444
github.com/russross/blackfriday/v2 v2.1.0 // indirect
45+
github.com/samber/mo v1.13.0 // indirect
4546
github.com/ugorji/go/codec v1.2.7 // indirect
4647
github.com/xdg-go/pbkdf2 v1.0.0 // indirect
4748
github.com/xdg-go/scram v1.1.2 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf
8484
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
8585
github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc=
8686
github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU=
87+
github.com/samber/mo v1.13.0 h1:LB1OwfJMju3a6FjghH+AIvzMG0ZPOzgTWj1qaHs1IQ4=
88+
github.com/samber/mo v1.13.0/go.mod h1:BfkrCPuYzVG3ZljnZB783WIJIGk1mcZr9c9CPf8tAxs=
8789
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
8890
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
8991
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=

internal/util/error.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ import (
2020
// `ErrorCode` newtype, but that requires a more invasive change to everything
2121
// that uses error codes.
2222
const (
23-
LockFailed int = 107
24-
SampleTooManyDuplicates int = 28799
23+
LockFailed = 107
24+
SampleTooManyDuplicates = 28799
25+
CursorKilled = 237
2526
)
2627

2728
//

internal/verifier/change_stream.go

Lines changed: 78 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ import (
66
"time"
77

88
"github.com/10gen/migration-verifier/internal/keystring"
9+
"github.com/10gen/migration-verifier/internal/retry"
10+
"github.com/10gen/migration-verifier/internal/util"
911
"github.com/pkg/errors"
1012
"github.com/rs/zerolog"
13+
"github.com/samber/mo"
1114
"go.mongodb.org/mongo-driver/bson"
1215
"go.mongodb.org/mongo-driver/bson/primitive"
1316
"go.mongodb.org/mongo-driver/mongo"
@@ -175,7 +178,7 @@ func (verifier *Verifier) readAndHandleOneChangeEventBatch(
175178
return nil
176179
}
177180

178-
func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.ChangeStream) {
181+
func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.ChangeStream) error {
179182
defer cs.Close(ctx)
180183

181184
var lastPersistedTime time.Time
@@ -201,10 +204,7 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
201204

202205
// If the context is canceled, return immediately.
203206
case <-ctx.Done():
204-
verifier.logger.Debug().
205-
Err(ctx.Err()).
206-
Msg("Change stream quitting.")
207-
return
207+
return ctx.Err()
208208

209209
// If the changeStreamEnderChan has a message, the user has indicated that
210210
// source writes are ended. This means we should exit rather than continue
@@ -222,8 +222,7 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
222222
var curTs primitive.Timestamp
223223
curTs, err = extractTimestampFromResumeToken(cs.ResumeToken())
224224
if err != nil {
225-
err = errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
226-
break
225+
return errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
227226
}
228227

229228
if curTs == writesOffTs || curTs.After(writesOffTs) {
@@ -238,7 +237,7 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
238237
err = verifier.readAndHandleOneChangeEventBatch(ctx, cs)
239238

240239
if err != nil {
241-
break
240+
return err
242241
}
243242
}
244243

@@ -248,17 +247,9 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
248247
if err == nil {
249248
err = persistResumeTokenIfNeeded()
250249
}
251-
}
252-
253-
if err != nil && !errors.Is(err, context.Canceled) {
254-
verifier.logger.Debug().
255-
Err(err).
256-
Msg("Sending change stream error.")
257-
258-
verifier.changeStreamErrChan <- err
259250

260-
if !gotwritesOffTimestamp {
261-
break
251+
if err != nil {
252+
return err
262253
}
263254
}
264255

@@ -284,18 +275,21 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
284275
}
285276

286277
infoLog.Msg("Change stream is done.")
278+
279+
return nil
287280
}
288281

289-
// StartChangeStream starts the change stream.
290-
func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
282+
func (verifier *Verifier) createChangeStream(
283+
ctx context.Context,
284+
) (*mongo.ChangeStream, primitive.Timestamp, error) {
291285
pipeline := verifier.GetChangeStreamFilter()
292286
opts := options.ChangeStream().
293287
SetMaxAwaitTime(1 * time.Second).
294288
SetFullDocument(options.UpdateLookup)
295289

296290
savedResumeToken, err := verifier.loadChangeStreamResumeToken(ctx)
297291
if err != nil {
298-
return errors.Wrap(err, "failed to load persisted change stream resume token")
292+
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to load persisted change stream resume token")
299293
}
300294

301295
csStartLogEvent := verifier.logger.Info()
@@ -322,40 +316,92 @@ func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
322316

323317
sess, err := verifier.srcClient.StartSession()
324318
if err != nil {
325-
return errors.Wrap(err, "failed to start session")
319+
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to start session")
326320
}
327321
sctx := mongo.NewSessionContext(ctx, sess)
328322
srcChangeStream, err := verifier.srcClient.Watch(sctx, pipeline, opts)
329323
if err != nil {
330-
return errors.Wrap(err, "failed to open change stream")
324+
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to open change stream")
331325
}
332326

333327
err = verifier.persistChangeStreamResumeToken(ctx, srcChangeStream)
334328
if err != nil {
335-
return err
329+
return nil, primitive.Timestamp{}, err
336330
}
337331

338-
csTimestamp, err := extractTimestampFromResumeToken(srcChangeStream.ResumeToken())
332+
startTs, err := extractTimestampFromResumeToken(srcChangeStream.ResumeToken())
339333
if err != nil {
340-
return errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
334+
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
341335
}
342336

337+
// With sharded clusters the resume token might lead the cluster time
338+
// by 1 increment. In that case we need the actual cluster time;
339+
// otherwise we will get errors.
343340
clusterTime, err := getClusterTimeFromSession(sess)
344341
if err != nil {
345-
return errors.Wrap(err, "failed to read cluster time from session")
342+
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to read cluster time from session")
346343
}
347344

348-
verifier.srcStartAtTs = &csTimestamp
349-
if csTimestamp.After(clusterTime) {
350-
verifier.srcStartAtTs = &clusterTime
345+
if startTs.After(clusterTime) {
346+
startTs = clusterTime
351347
}
352348

349+
return srcChangeStream, startTs, nil
350+
}
351+
352+
// StartChangeStream starts the change stream.
353+
func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
354+
resultChan := make(chan mo.Result[primitive.Timestamp])
355+
356+
go func() {
357+
retryer := retry.New(retry.DefaultDurationLimit)
358+
retryer = retryer.WithErrorCodes(util.CursorKilled)
359+
360+
parentThreadWaiting := true
361+
362+
err := retryer.
363+
RunForTransientErrorsOnly(
364+
ctx,
365+
verifier.logger,
366+
func(i *retry.Info) error {
367+
srcChangeStream, startTs, err := verifier.createChangeStream(ctx)
368+
if err != nil {
369+
return err
370+
}
371+
372+
if parentThreadWaiting {
373+
resultChan <- mo.Ok(startTs)
374+
close(resultChan)
375+
parentThreadWaiting = false
376+
}
377+
378+
return verifier.iterateChangeStream(ctx, srcChangeStream)
379+
},
380+
)
381+
382+
if err != nil {
383+
if parentThreadWaiting {
384+
resultChan <- mo.Err[primitive.Timestamp](err)
385+
} else {
386+
verifier.changeStreamErrChan <- err
387+
close(verifier.changeStreamErrChan)
388+
}
389+
}
390+
}()
391+
392+
result := <-resultChan
393+
394+
startTs, err := result.Get()
395+
if err != nil {
396+
return err
397+
}
398+
399+
verifier.srcStartAtTs = &startTs
400+
353401
verifier.mux.Lock()
354402
verifier.changeStreamRunning = true
355403
verifier.mux.Unlock()
356404

357-
go verifier.iterateChangeStream(ctx, srcChangeStream)
358-
359405
return nil
360406
}
361407

internal/verifier/check.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ func (verifier *Verifier) CheckWorker(ctx context.Context) error {
8787
select {
8888
case err := <-verifier.changeStreamErrChan:
8989
cancel()
90-
return err
90+
return errors.Wrap(err, "change stream failed")
9191
case <-ctx.Done():
9292
cancel()
9393
return nil

vendor/github.com/samber/mo/.gitignore

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/samber/mo/Dockerfile

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/samber/mo/LICENSE

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/samber/mo/Makefile

Lines changed: 44 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)