Skip to content

Commit d7b456e

Browse files
authored
REP-6088 Tolerate high numbers of mismatches (#117)
Previously all document mismatches were recorded directly in the verification task. This meant, though, that if a task encompassed a large number of mismatched or missing documents, the verifier could fail to persist all of the mismatches, which caused a crash. (The usual cause of excess mismatched/missing documents is starting migration-verifier before initial sync finishes, but it can also reasonably happen without REP-6129’s fix for queries against pre-v5 servers. See HELP-75910.) This changeset makes the verifier save mismatches to a dedicated collection instead, one document per mismatch. This change upends some familiar workflows for investigating mismatches: it’s no longer sufficient just to query the `verification_tasks` collection for mismatch information since the actual mismatches are recorded in a separate collection. To address this, the documentation now gives an aggregation pipeline that yields a similarly-useful result. This entails a metadata version change. Because that’s happening, this also changes the task type `verify` to `verifyDocuments`. (That required some sorting workarounds in tests, which were tightly coupled to the task type strings.)
1 parent bca252e commit d7b456e

File tree

15 files changed

+826
-299
lines changed

15 files changed

+826
-299
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,31 @@ The verifier will now check to completion to make sure that there are no inconsi
142142
| `--ignoreReadConcern` | Use connection-default read concerns rather than setting majority read concern. This option may degrade consistency, so only enable it if majority read concern (the default) doesn’t work. |
143143
| `--help`, `-h` | show help |
144144
145+
# Investigation of Mismatches
146+
147+
The verifier records any mismatches it finds in its metadata’s `mismatches`
148+
collection. Mismatches are indexed by verification task ID. To find a given
149+
generation’s mismatches, aggregate like this on the metadata cluster:
150+
151+
// Change this as needed if you specify a custom metadata database:
152+
use migration_verification_metadata
153+
154+
db.verification_tasks.aggregate(
155+
{ $match: {
156+
generation: <whichever generation>,
157+
status: "failed",
158+
} },
159+
{ $lookup: {
160+
from: "mismatches",
161+
localField: "_id",
162+
foreignField: "task",
163+
as: "mismatch",
164+
}},
165+
{ $unwind: "$mismatch" },
166+
)
167+
168+
Note that each mismatch includes timestamps. You can cross-reference
169+
these with the clusters’ oplogs to diagnose problems.
145170
146171
# Benchmarking Results
147172

chanutil/io.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package chanutil
2+
3+
import (
4+
"context"
5+
6+
"github.com/10gen/migration-verifier/internal/util"
7+
"github.com/10gen/migration-verifier/option"
8+
)
9+
10+
// ReadWithDoneCheck waits in one select for either `ctx.Done()` or a receive
11+
// from `ch`. If the context is done, it returns None plus the error from
12+
// `util.WrapCtxErrWithCause(ctx)`, which includes the cancellation cause. If it
13+
// receives a value from `ch`, it returns Some(value) and a nil error. If `ch`
14+
// is closed, it returns None and a nil error.
15+
func ReadWithDoneCheck[T any](ctx context.Context, ch <-chan T) (option.Option[T], error) {
16+
select { // if both cases are ready, Go chooses one pseudo-randomly
17+
case <-ctx.Done():
18+
return option.None[T](), util.WrapCtxErrWithCause(ctx)
19+
case val, ok := <-ch:
20+
if ok {
21+
return option.Some(val), nil
22+
}
23+
24+
return option.None[T](), nil // !ok: ch is closed, so there is no value and no error
25+
}
26+
}
27+
28+
// WriteWithDoneCheck waits in one select for either `ctx.Done()` or a send of `val` on `ch`.
29+
// If the context is done, it returns the error from `util.WrapCtxErrWithCause(ctx)`, which
30+
// includes the cancellation cause; in that case `val` was not sent. If the send on `ch`
31+
// succeeds, it returns `nil`.
32+
func WriteWithDoneCheck[T any](ctx context.Context, ch chan<- T, val T) error {
33+
select { // if both cases are ready, Go chooses one pseudo-randomly
34+
case <-ctx.Done():
35+
return util.WrapCtxErrWithCause(ctx)
36+
case ch <- val:
37+
return nil
38+
}
39+
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/10gen/migration-verifier
22

3-
go 1.22
3+
go 1.24
44

55
require (
66
github.com/cespare/permute/v2 v2.0.0-beta2

internal/testutil/testutil.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,63 @@ package testutil
22

33
import (
44
"context"
5+
"crypto/rand"
6+
"encoding/hex"
7+
"log"
58
"testing"
69

710
"github.com/pkg/errors"
11+
"github.com/samber/lo"
812
"go.mongodb.org/mongo-driver/bson"
913
"go.mongodb.org/mongo-driver/mongo"
1014
"go.mongodb.org/mongo-driver/mongo/options"
1115
"go.mongodb.org/mongo-driver/mongo/readconcern"
1216
)
1317

18+
// generateRandomFieldName returns baseName followed by "_" and 8 random
// hexadecimal characters (4 bytes read from crypto/rand). If reading random
// bytes fails, it calls log.Fatal rather than returning an error.
func generateRandomFieldName(baseName string) string {
19+
b := make([]byte, 4) // 8 hex chars
20+
_, err := rand.Read(b)
21+
if err != nil {
22+
log.Fatal(err)
23+
}
24+
return baseName + "_" + hex.EncodeToString(b)
25+
}
26+
27+
// SortByListAgg returns aggregation stages that sort results by where their
28+
// `fieldName` value appears in `values`. Any result that doesn’t match a list
// value goes at the end. The sort key lives in a temporary, randomly-named
// field that the final $project stage removes again.
29+
func SortByListAgg[T any](
30+
fieldName string,
31+
values []T,
32+
) []bson.D {
33+
fieldRef := "$" + fieldName // "$"-prefixed field path for use inside expressions
34+
35+
sortField := generateRandomFieldName("sortOrder") // random suffix; presumably to avoid clobbering a real field
36+
37+
branches := lo.Map(
38+
values,
39+
func(v T, i int) bson.D {
40+
return bson.D{
41+
{"case", bson.D{{"$eq", bson.A{
42+
fieldRef,
43+
bson.D{{"$literal", v}}, // $literal: compare against v as-is, not as an expression
44+
}}}},
45+
{"then", i}, // sort key is the value's index in `values`
46+
}
47+
},
48+
)
49+
50+
return mongo.Pipeline{
51+
{{"$addFields", bson.D{
52+
{sortField, bson.D{{"$switch", bson.D{
53+
{"branches", branches},
54+
{"default", len(values)}, // unmatched results get a key after every listed value
55+
}}}},
56+
}}},
57+
{{"$sort", bson.D{{sortField, 1}}}},
58+
{{"$project", bson.D{{sortField, 0}}}},
59+
}
60+
}
61+
1462
// MustMarshal wraps `bson.Marshal` with a panic on failure.
1563
func MustMarshal(doc any) bson.Raw {
1664
raw, err := bson.Marshal(doc)

0 commit comments

Comments
 (0)