Skip to content

Commit e1507bd

Browse files
committed
squashed
1 parent a1945d7 commit e1507bd

File tree

11 files changed

+399
-75
lines changed

11 files changed

+399
-75
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/10gen/migration-verifier
22

3-
go 1.20
3+
go 1.22
44

55
require (
66
github.com/cespare/permute/v2 v2.0.0-beta2

internal/retry/retry.go

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package retry
33
import (
44
"context"
55
"math/rand"
6+
"slices"
67
"time"
78

89
"github.com/10gen/migration-verifier/internal/logger"
@@ -11,6 +12,8 @@ import (
1112
"go.mongodb.org/mongo-driver/bson"
1213
)
1314

15+
var CustomTransientErr = errors.New("possibly-transient failure detected")
16+
1417
// RunForUUIDAndTransientErrors retries f() for the CollectionUUIDMismatch error and for transient errors.
1518
// This should be used to run a driver operation that optionally specifies the `collectionUUID` parameter
1619
// for a collection that may have been:
@@ -273,25 +276,42 @@ func (r *Retryer) shouldRetryWithSleep(
273276
return false
274277
}
275278

279+
var whyTransient string
280+
276281
errCode := util.GetErrorCode(err)
277-
if util.IsTransientError(err) {
278-
logger.Warn().Int("error code", errCode).Err(err).Msgf(
279-
"Waiting %s seconds to retry operation after transient error.", sleepTime)
280-
return true
282+
283+
if util.IsTransientError(err) || slices.Contains(r.additionalErrorCodes, errCode) {
284+
whyTransient = "Error code suggests a transient error."
285+
} else if errors.Is(err, CustomTransientErr) {
286+
whyTransient = "Error may be transient."
281287
}
282288

283-
for _, code := range r.additionalErrorCodes {
284-
if code == errCode {
285-
logger.Warn().Int("error code", errCode).Err(err).Msgf(
286-
"Waiting %s seconds to retry operation after an error because it is in our additional codes list.", sleepTime)
287-
return true
289+
if whyTransient == "" {
290+
event := logger.Debug().Err(err)
291+
292+
if errCode != 0 {
293+
event = event.Int("error code", errCode)
288294
}
295+
296+
event.Msg("Not retrying on error because it appears not to be transient.")
297+
298+
return false
299+
}
300+
301+
retryLogEvent := logger.Warn().
302+
Err(err).
303+
Stringer("duration", sleepTime)
304+
305+
if errCode != 0 {
306+
retryLogEvent = retryLogEvent.
307+
Int("error code", errCode)
289308
}
290309

291-
logger.Debug().Err(err).Int("error code", errCode).
292-
Msg("Not retrying on error because it is not transient nor is it in our additional codes list.")
310+
retryLogEvent.
311+
Str("reason", whyTransient).
312+
Msg("Pausing then will retry operation.")
293313

294-
return false
314+
return true
295315
}
296316

297317
// Use this method for aggregates which should take a UUID in new versions but not old ones.

internal/verifier/change_stream.go

Lines changed: 53 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ import (
1212
"go.mongodb.org/mongo-driver/bson/primitive"
1313
"go.mongodb.org/mongo-driver/mongo"
1414
"go.mongodb.org/mongo-driver/mongo/options"
15-
"golang.org/x/exp/constraints"
1615
)
1716

1817
const fauxDocSizeForDeleteEvents = 1024
@@ -138,12 +137,14 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
138137
return err
139138
}
140139

141-
readAndHandleOneChangeEventBatch := func() (bool, error) {
140+
readAndHandleOneChangeEventBatch := func() error {
142141
eventsRead := 0
143142
var changeEventBatch []ParsedEvent
144143

145144
for hasEventInBatch := true; hasEventInBatch; hasEventInBatch = cs.RemainingBatchLength() > 0 {
145+
verifier.logger.Info().Msg("reading event")
146146
gotEvent := cs.TryNext(ctx)
147+
verifier.logger.Info().Err(cs.Err()).Msgf("got event? %v", gotEvent)
147148

148149
if !gotEvent || cs.Err() != nil {
149150
break
@@ -154,21 +155,29 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
154155
}
155156

156157
if err := cs.Decode(&changeEventBatch[eventsRead]); err != nil {
157-
return false, errors.Wrap(err, "failed to decode change event")
158+
return errors.Wrap(err, "failed to decode change event")
158159
}
159160

161+
verifier.logger.Debug().Interface("event", changeEventBatch[eventsRead]).Msg("Got event.")
162+
160163
eventsRead++
161164
}
162165

163-
if eventsRead > 0 {
164-
verifier.logger.Debug().Int("eventsCount", eventsRead).Msgf("Received a batch of events.")
165-
err := verifier.HandleChangeStreamEvents(ctx, changeEventBatch)
166-
if err != nil {
167-
return false, errors.Wrap(err, "failed to handle change events")
168-
}
166+
if cs.Err() != nil {
167+
return errors.Wrap(cs.Err(), "change stream iteration failed")
168+
}
169+
170+
if eventsRead == 0 {
171+
return nil
169172
}
170173

171-
return eventsRead > 0, errors.Wrap(cs.Err(), "change stream iteration failed")
174+
verifier.logger.Debug().Int("eventsCount", eventsRead).Msgf("Received a batch of events.")
175+
err := verifier.HandleChangeStreamEvents(ctx, changeEventBatch)
176+
if err != nil {
177+
return errors.Wrap(err, "failed to handle change events")
178+
}
179+
180+
return nil
172181
}
173182

174183
for {
@@ -184,29 +193,45 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
184193
// If the changeStreamEnderChan has a message, the user has indicated that
185194
// source writes are ended. This means we should exit rather than continue
186195
// reading the change stream since there should be no more events.
187-
case <-verifier.changeStreamEnderChan:
196+
case finalTs := <-verifier.changeStreamFinalTsChan:
188197
verifier.logger.Debug().
189-
Msg("Change stream thread received shutdown request.")
198+
Interface("finalTimestamp", finalTs).
199+
Msg("Change stream thread received final timestamp. Finalizing change stream.")
190200

191201
changeStreamEnded = true
192202

193203
// Read all change events until the source reports no events.
194204
// (i.e., the `getMore` call returns empty)
195205
for {
196-
var gotEvent bool
197-
gotEvent, err = readAndHandleOneChangeEventBatch()
206+
var curTs primitive.Timestamp
207+
curTs, err = extractTimestampFromResumeToken(cs.ResumeToken())
208+
if err != nil {
209+
err = errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
210+
break
211+
}
212+
213+
if curTs.Compare(finalTs) >= 0 {
214+
verifier.logger.Debug().
215+
Interface("currentTimestamp", curTs).
216+
Interface("finalTimestamp", finalTs).
217+
Msg("Change stream has reached the final timestamp. Shutting down.")
198218

199-
if !gotEvent || err != nil {
219+
break
220+
}
221+
222+
err = readAndHandleOneChangeEventBatch()
223+
224+
if err != nil {
200225
break
201226
}
202227
}
203228

204229
default:
205-
_, err = readAndHandleOneChangeEventBatch()
206-
}
230+
err = readAndHandleOneChangeEventBatch()
207231

208-
if err == nil {
209-
err = persistResumeTokenIfNeeded()
232+
if err == nil {
233+
err = persistResumeTokenIfNeeded()
234+
}
210235
}
211236

212237
if err != nil && !errors.Is(err, context.Canceled) {
@@ -237,9 +262,9 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
237262

238263
infoLog := verifier.logger.Info()
239264
if verifier.lastChangeEventTime == nil {
240-
infoLog = infoLog.Str("changeStreamStopTime", "none")
265+
infoLog = infoLog.Str("lastEventTime", "none")
241266
} else {
242-
infoLog = infoLog.Interface("changeStreamStopTime", *verifier.lastChangeEventTime)
267+
infoLog = infoLog.Interface("lastEventTime", *verifier.lastChangeEventTime)
243268
}
244269

245270
infoLog.Msg("Change stream is done.")
@@ -265,7 +290,7 @@ func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
265290

266291
ts, err := extractTimestampFromResumeToken(savedResumeToken)
267292
if err == nil {
268-
logEvent = addUnixTimeToLogEvent(ts.T, logEvent)
293+
logEvent = addTimestampToLogEvent(ts, logEvent)
269294
} else {
270295
verifier.logger.Warn().
271296
Err(err).
@@ -284,6 +309,7 @@ func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
284309
return errors.Wrap(err, "failed to start session")
285310
}
286311
sctx := mongo.NewSessionContext(ctx, sess)
312+
verifier.logger.Debug().Interface("pipeline", pipeline).Msg("Opened change stream.")
287313
srcChangeStream, err := verifier.srcClient.Watch(sctx, pipeline, opts)
288314
if err != nil {
289315
return errors.Wrap(err, "failed to open change stream")
@@ -318,8 +344,10 @@ func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
318344
return nil
319345
}
320346

321-
func addUnixTimeToLogEvent[T constraints.Integer](unixTime T, event *zerolog.Event) *zerolog.Event {
322-
return event.Time("clockTime", time.Unix(int64(unixTime), int64(0)))
347+
func addTimestampToLogEvent(ts primitive.Timestamp, event *zerolog.Event) *zerolog.Event {
348+
return event.
349+
Interface("timestamp", ts).
350+
Time("time", time.Unix(int64(ts.T), int64(0)))
323351
}
324352

325353
func (v *Verifier) getChangeStreamMetadataCollection() *mongo.Collection {
@@ -358,7 +386,7 @@ func (verifier *Verifier) persistChangeStreamResumeToken(ctx context.Context, cs
358386
logEvent := verifier.logger.Debug()
359387

360388
if err == nil {
361-
logEvent = addUnixTimeToLogEvent(ts.T, logEvent)
389+
logEvent = addTimestampToLogEvent(ts, logEvent)
362390
} else {
363391
verifier.logger.Warn().Err(err).
364392
Msg("failed to extract resume token timestamp")

internal/verifier/check.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,24 @@ func (verifier *Verifier) Check(ctx context.Context, filter map[string]any) {
4242
verifier.MaybeStartPeriodicHeapProfileCollection(ctx)
4343
}
4444

45-
func (verifier *Verifier) waitForChangeStream() error {
45+
func (verifier *Verifier) waitForChangeStream(ctx context.Context) error {
4646
verifier.mux.RLock()
4747
csRunning := verifier.changeStreamRunning
4848
verifier.mux.RUnlock()
4949
if csRunning {
5050
verifier.logger.Debug().Msg("Changestream still running, signalling that writes are done and waiting for change stream to exit")
51-
verifier.changeStreamEnderChan <- struct{}{}
51+
52+
finalTs, err := GetClusterTime(
53+
ctx,
54+
verifier.logger,
55+
verifier.srcClient,
56+
)
57+
58+
if err != nil {
59+
return errors.Wrapf(err, "failed to fetch source's cluster time")
60+
}
61+
62+
verifier.changeStreamFinalTsChan <- finalTs
5263
select {
5364
case err := <-verifier.changeStreamErrChan:
5465
verifier.logger.Warn().Err(err).
@@ -242,7 +253,7 @@ func (verifier *Verifier) CheckDriver(ctx context.Context, filter map[string]any
242253
// It's necessary to wait for the change stream to finish before incrementing the
243254
// generation number, or the last changes will not be checked.
244255
verifier.mux.Unlock()
245-
err := verifier.waitForChangeStream()
256+
err := verifier.waitForChangeStream(ctx)
246257
if err != nil {
247258
return err
248259
}

0 commit comments

Comments
 (0)